Full support of cl_intel_motion_estimation extension.

Submitted by Chuanbo Weng on Nov. 11, 2015, 9:22 a.m.

Details

Message ID 1447233749-16111-1-git-send-email-chuanbo.weng@intel.com
State New
Headers show
Series "Full support of cl_intel_motion_estimation extension." ( rev: 1 ) in Beignet

Not browsing as part of any series.

Commit Message

Chuanbo Weng Nov. 11, 2015, 9:22 a.m.
The following items are supported in this commit:
1. Return residuals.
2. All types of mb_block_type, subpixel_mode, sad_adjust_mode in
   cl_motion_estimation_desc_intel.
After this commit, cl_intel_motion_estimation is fully supported.

Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>
---
 docs/howto/video-motion-estimation-howto.mdwn      |  26 +--
 .../cl_internal_block_motion_estimate_intel.cl     | 199 +++++++++++++++++----
 2 files changed, 175 insertions(+), 50 deletions(-)

Patch hide | download patch | download mbox

diff --git a/docs/howto/video-motion-estimation-howto.mdwn b/docs/howto/video-motion-estimation-howto.mdwn
index d9edc9b..8deaa61 100644
--- a/docs/howto/video-motion-estimation-howto.mdwn
+++ b/docs/howto/video-motion-estimation-howto.mdwn
@@ -1,21 +1,15 @@ 
 Video Motion Vector HowTo
 ==========================
 
-Beignet now supports cl_intel_accelerator and part of cl_intel_motion_estimation, which
-are Khronos official extensions. It provides a hardware acceleration of video motion
+Beignet now supports cl_intel_accelerator and cl_intel_motion_estimation, which are
+Khronos official extensions. It provides a hardware acceleration of video motion
 vector to users.
 
-Supported hardware platform and limitation
-------------------------------------------
+Supported hardware platform
+---------------------------
 
-Only 3rd Generation Intel Core Processors is supported for vme now. And now we just
-implement this part of cl_intel_motion_estimation for motion vector computation(residuals
-can not be returned yet) on 3rd Generation Intel Core Processors:
-  mb_block_type = CL_ME_MB_TYPE_16x16_INTEL
-  subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
-  search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL / CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
-                     / CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
-We will fully support cl_intel_motion_estimation in the future.
+Only 3rd Generation Intel Core Processors is supported for vme now. We will consider
+to support more platforms if necessary.
 
 Steps
 -----
@@ -23,15 +17,13 @@  Steps
 In order to use video motion estimation provided by Beignet in your program, please follow
 the steps as below:
 
-- Create a cl_accelerator_intel object using extension API clCreateAcceleratorINTEL, with
-  the following parameters:
+- Create a cl_accelerator_intel object using extension API clCreateAcceleratorINTEL, like
+  this:
   _accelerator_type_intel accelerator_type = CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL;
   cl_motion_estimation_desc_intel vmedesc = {CL_ME_MB_TYPE_16x16_INTEL,
                                              CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
                                              CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
-                                             CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL(
-                                             or CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
-                                             or CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL)
+                                             CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
                                             };
 
 - Invoke clCreateProgramWithBuiltInKernels to create a program object with built-in kernels
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
index 5a22338..1f28f4e 100644
--- a/src/kernels/cl_internal_block_motion_estimate_intel.cl
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -59,23 +59,28 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
   int lgid_x = get_group_id(0);
   int lgid_y = get_group_id(1);
 
+  int num_groups_x = get_num_groups(0);
+  int index = lgid_y * num_groups_x + lgid_x;
+
   uint2 srcCoord = 0;
+  short2 predict_mv = 0;
+  if(prediction_motion_vector_buffer != NULL){
+    predict_mv = prediction_motion_vector_buffer[index];
+    predict_mv.x = predict_mv.x / 4;
+    predict_mv.y = predict_mv.y / 4;
+  }
 
   srcCoord.x = lgid_x * 16;
   srcCoord.y = lgid_y * 16;
 
-  //TODO: This line of code is just to workaround a curbe related bug caused by commit 061d214a6fc2876a0e24e094f87f2a172984bc23
-  //After fix, this line should be removed.
-  src_grf0_dw5 = accel.mb_block_type;
-
   //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
   if(accel.search_path_type == 0x0){
-    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
     src_grf0_dw5 =   (20 << 24)         | (20 << 16)        | (0 << 8)       | (0);
     //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
-    src_grf0_dw1 =   0xfffefffe;
+    src_grf0_dw1 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
     //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
-    src_grf0_dw0 =   0xfffefffe;
+    src_grf0_dw0 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
     //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
     src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
                    //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
@@ -84,35 +89,59 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
   //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
   else if(accel.search_path_type == 0x1){
     src_grf0_dw5 =   (24 << 24)         | (24 << 16)        | (0 << 8)       | (0);
-    src_grf0_dw1 =   0xfffcfffc;
-    src_grf0_dw0 =   0xfffcfffc;
+    src_grf0_dw1 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
+    src_grf0_dw0 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
     src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
                      | (0 << 16)                     | (48 << 8)                       | (48);
   }
   //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
   else if(accel.search_path_type == 0x5){
     src_grf0_dw5 =   (40 << 24)         | (48 << 16)        | (0 << 8)       | (0);
-    src_grf0_dw1 =   0xfff4fff0;
-    src_grf0_dw0 =   0xfff4fff0;
+    src_grf0_dw1 =   ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) & 0x0000ffff);
+    src_grf0_dw0 =   ((-12 + predict_mv.y) << 16 ) | ((-16 +  + predict_mv.x) & 0x0000ffff);
     src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
                      | (0 << 16)                     | (48 << 8)                       | (48);
   }
 
-  //src_grf0_dw7 = Debug;
-  src_grf0_dw7 = 0;
-  //src_grf0_dw6 = Debug;
-  src_grf0_dw6 = 0;
-  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
-  //src_grf0_dw4 = Ignored;
-  src_grf0_dw4 = 0;
-  //src_grf0_dw3 = (Reserved << 31)                 | (Sub_Mb_Part_Mask << 24)       | (Intra_SAD << 22)
-  src_grf0_dw3 =   (0 << 31)                 | (0x7e << 24)                   | (0 << 22)
+  /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/
+  uchar sub_mb_part_mask = 0;
+  //CL_ME_MB_TYPE_16x16_INTEL
+  if(accel.mb_block_type == 0x0)
+    sub_mb_part_mask = 0x7e;
+  //CL_ME_MB_TYPE_8x8_INTEL
+  else if(accel.mb_block_type == 0x1)
+    sub_mb_part_mask = 0x77;
+  //CL_ME_MB_TYPE_4x4_INTEL
+  else if(accel.mb_block_type == 0x2)
+    sub_mb_part_mask = 0x3f;
+
+  uchar inter_sad = 0;
+  //CL_ME_SAD_ADJUST_MODE_NONE_INTEL
+  if(accel.sad_adjust_mode == 0x0)
+    inter_sad = 0;
+  //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL
+  else if(accel.sad_adjust_mode == 0x1)
+    inter_sad = 2;
+
+  uchar sub_pel_mode = 0;
+  //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
+  if(accel.subpixel_mode == 0x0)
+    sub_pel_mode = 0;
+  //CL_ME_SUBPIXEL_MODE_HPEL_INTEL
+  else if(accel.subpixel_mode == 0x1)
+    sub_pel_mode = 1;
+  //CL_ME_SUBPIXEL_MODE_QPEL_INTEL
+  else if(accel.subpixel_mode == 0x2)
+    sub_pel_mode = 3;
+
+  //src_grf0_dw3 = (Reserved << 31)                | (Sub_Mb_Part_Mask << 24)       | (Intra_SAD << 22)
+  src_grf0_dw3 =   (0 << 31)                       | (sub_mb_part_mask << 24)       | (0 << 22)
                  //| (Inter_SAD << 20)             | (BB_Skip_Enabled << 19)        | (Reserverd << 18)
-                   | (0 << 20)                     | (0 << 19)                      | (0 << 18)
+                   | (inter_sad << 20)             | (0 << 19)                      | (0 << 18)
                  //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)  | (Dis_Field_Cache_Alloc << 15)
                    | (0 << 17)                     | (0 << 16)                      | (0 << 15)
                  //| (Skip_Type << 14)             | (Sub_Pel_Mode << 12)           | (Dual_Search_Path_Opt << 11)
-                   | (0 << 14)                     | (0 << 12)                      | (0 << 11)
+                   | (0 << 14)                     | (sub_pel_mode << 12)           | (0 << 11)
                  //| (Search_Ctrl << 8)            | (Ref_Access << 7)              | (SrcAccess << 6)
                    | (0 << 8)                      | (0 << 7)                       | (0 << 6)
                  //| (Mb_Type_Remap << 4)          | (Reserved_Workaround << 3)     | (Reserved_Workaround << 2)
@@ -120,6 +149,15 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
                  //| (Src_Size);
                    | (0);
 
+
+  //src_grf0_dw7 = Debug;
+  src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+  //src_grf0_dw4 = Ignored;
+  src_grf0_dw4 = 0;
+
   //src_grf0_dw2 = (SrcY << 16) | (SrcX);
   src_grf0_dw2 = (srcCoord.y << 16)  | (srcCoord.x);
   //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
@@ -142,7 +180,8 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
   /*src_grf1_dw1 = (RepartEn << 31)                 | (FBPrunEn << 30)               | (AdaptiveValidationControl << 29)
                  | (Uni_Mix_Disable << 28)       | (Bi_Sub_Mb_Part_Mask << 24)    | (Reserverd << 22)
                  | (Bi_Weight << 16)             | (Reserved << 6)                | (MaxNumMVs);*/
-  src_grf1_dw1 = (0 << 24) | (2);
+  //src_grf1_dw1 = (0 << 24) | (2);
+  src_grf1_dw1 = (0 << 24) | (16);
   /*src_grf1_dw0 = (Early_Ime_Stop << 24)           | (Early_Fme_Success << 16)      | (Skip_Success << 8)
                  | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6)           | (Early_Ime_Success_En << 5)
                  | (Early_Success_En << 4)       | (Part_Candidate_En << 3)       | (Bi_Mix_Dis << 2)
@@ -201,6 +240,8 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
   src_grf4_dw1 = 0;
   src_grf4_dw0 = 0;
 
+  int lid_x = get_local_id(0);
+
   vme_result = __gen_ocl_vme(src_image, ref_image,
                 src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
                 src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
@@ -217,17 +258,109 @@  void block_motion_estimate_intel(accelerator_intel_t accel,
 
   barrier(CLK_LOCAL_MEM_FENCE);
 
-  int lid_x = get_local_id(0);
+  short2 mv[16];
+  ushort res[16];
+
+  uint write_back_dwx;
   uint simd_width = get_sub_group_size();
-  uint write_back_grf1_dw0;
-  if(simd_width == 8)
-    write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);
-  else if(simd_width == 16)
-    write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);
-  short2 val = as_short2( write_back_grf1_dw0 );
-  int index = lgid_y * get_num_groups(0) + lgid_x;
-  if( lid_x == 0 ){
-    motion_vector_buffer[index] = val;
+
+  /* In simd 8 mode, one kernel variable 'uint' map to 8 dword.
+   * In simd 16 mode, one kernel variable 'uint' map to 16 dword.
+   * That's why we should treat simd8 and simd16 differently when
+   * use __gen_ocl_region.
+   * */
+  if(simd_width == 8){
+    write_back_dwx = __gen_ocl_region(0, vme_result.s1);
+    mv[0] = as_short2( write_back_dwx );
+
+    if(accel.mb_block_type > 0x0){
+      for(int i = 2, j = 1; j < 4; i += 2, j++){
+        write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+        mv[j] = as_short2( write_back_dwx );
+      }
+      if(accel.mb_block_type > 0x1){
+        for(int i = 0, j = 4; j < 8; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 8; j < 12; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s3);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 12; j < 16; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s4);
+          mv[j] = as_short2( write_back_dwx );
+        }
+      }
+    }
+    ushort2 temp_res;
+    for(int i = 0; i < 8; i++){
+      write_back_dwx = __gen_ocl_region(i, vme_result.s5);
+      temp_res = as_ushort2(write_back_dwx);
+      res[i*2] = temp_res.s0;
+      res[i*2+1] = temp_res.s1;
+    }
+  }
+  else if(simd_width == 16){
+    write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0);
+    mv[0] = as_short2( write_back_dwx );
+
+    if(accel.mb_block_type > 0x0){
+      for(int i = 2, j = 1; j < 4; i += 2, j++){
+        write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0);
+        mv[j] = as_short2( write_back_dwx );
+      }
+      if(accel.mb_block_type > 0x1){
+        for(int i = 0, j = 4; j < 8; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 8; j < 12; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 12; j < 16; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+          mv[j] = as_short2( write_back_dwx );
+        }
+      }
+    }
+    ushort2 temp_res;
+    for(int i = 0; i < 8; i++){
+      write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2);
+      temp_res = as_ushort2(write_back_dwx);
+      res[i*2] = temp_res.s0;
+      res[i*2+1] = temp_res.s1;
+    }
+  }
+
+  int mv_index;
+
+  //CL_ME_MB_TYPE_16x16_INTEL
+  if(accel.mb_block_type == 0x0){
+    mv_index = index * 1;
+    if( lid_x == 0 ){
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      residuals[mv_index] = 2 * res[lid_x];
+    }
+  }
+  //CL_ME_MB_TYPE_8x8_INTEL
+  else if(accel.mb_block_type == 0x1){
+    if(lid_x < 4){
+      mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;
+      mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      residuals[mv_index] = 2 * res[lid_x];
+    }
+  }
+  //CL_ME_MB_TYPE_4x4_INTEL
+  else if(accel.mb_block_type == 0x2){
+    if(lid_x < 16){
+      mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;
+      mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      residuals[mv_index] = 2 * res[lid_x];
+    }
   }
 
 }

Comments

LGTM, pushed, thanks.

> -----Original Message-----

> From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf Of

> Chuanbo Weng

> Sent: Wednesday, November 11, 2015 17:22

> To: beignet@lists.freedesktop.org

> Cc: Weng, Chuanbo

> Subject: [Beignet] [PATCH] Full support of cl_intel_motion_estimation

> extension.

> 

> The following items are supported in this commit:

> 1. Return residuals.

> 2. All types of mb_block_type, subpixel_mode, sad_adjust_mode in

>    cl_motion_estimation_desc_intel.

> After this commit, cl_intel_motion_estimation is fully supported.

> 

> Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>

> ---

>  docs/howto/video-motion-estimation-howto.mdwn      |  26 +--

>  .../cl_internal_block_motion_estimate_intel.cl     | 199

> +++++++++++++++++----

>  2 files changed, 175 insertions(+), 50 deletions(-)

> 

> diff --git a/docs/howto/video-motion-estimation-howto.mdwn

> b/docs/howto/video-motion-estimation-howto.mdwn

> index d9edc9b..8deaa61 100644

> --- a/docs/howto/video-motion-estimation-howto.mdwn

> +++ b/docs/howto/video-motion-estimation-howto.mdwn

> @@ -1,21 +1,15 @@

>  Video Motion Vector HowTo

>  ==========================

> 

> -Beignet now supports cl_intel_accelerator and part of

> cl_intel_motion_estimation, which -are Khronos official extensions. It

> provides a hardware acceleration of video motion

> +Beignet now supports cl_intel_accelerator and

> +cl_intel_motion_estimation, which are Khronos official extensions. It

> +provides a hardware acceleration of video motion

>  vector to users.

> 

> -Supported hardware platform and limitation

> -------------------------------------------

> +Supported hardware platform

> +---------------------------

> 

> -Only 3rd Generation Intel Core Processors is supported for vme now. And

> now we just -implement this part of cl_intel_motion_estimation for motion

> vector computation(residuals -can not be returned yet) on 3rd Generation

> Intel Core Processors:

> -  mb_block_type = CL_ME_MB_TYPE_16x16_INTEL

> -  subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL

> -  search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL /

> CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL

> -                     / CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL

> -We will fully support cl_intel_motion_estimation in the future.

> +Only 3rd Generation Intel Core Processors is supported for vme now. We

> +will consider to support more platforms if necessary.

> 

>  Steps

>  -----

> @@ -23,15 +17,13 @@ Steps

>  In order to use video motion estimation provided by Beignet in your

> program, please follow  the steps as below:

> 

> -- Create a cl_accelerator_intel object using extension API

> clCreateAcceleratorINTEL, with

> -  the following parameters:

> +- Create a cl_accelerator_intel object using extension API

> +clCreateAcceleratorINTEL, like

> +  this:

>    _accelerator_type_intel accelerator_type =

> CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL;

>    cl_motion_estimation_desc_intel vmedesc =

> {CL_ME_MB_TYPE_16x16_INTEL,

>                                               CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,

>                                               CL_ME_SAD_ADJUST_MODE_NONE_INTEL,

> -                                             CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL(

> -                                             or CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL

> -                                             or CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL)

> +

> + CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL

>                                              };

> 

>  - Invoke clCreateProgramWithBuiltInKernels to create a program object with

> built-in kernels diff --git

> a/src/kernels/cl_internal_block_motion_estimate_intel.cl

> b/src/kernels/cl_internal_block_motion_estimate_intel.cl

> index 5a22338..1f28f4e 100644

> --- a/src/kernels/cl_internal_block_motion_estimate_intel.cl

> +++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl

> @@ -59,23 +59,28 @@ void

> block_motion_estimate_intel(accelerator_intel_t accel,

>    int lgid_x = get_group_id(0);

>    int lgid_y = get_group_id(1);

> 

> +  int num_groups_x = get_num_groups(0);  int index = lgid_y *

> + num_groups_x + lgid_x;

> +

>    uint2 srcCoord = 0;

> +  short2 predict_mv = 0;

> +  if(prediction_motion_vector_buffer != NULL){

> +    predict_mv = prediction_motion_vector_buffer[index];

> +    predict_mv.x = predict_mv.x / 4;

> +    predict_mv.y = predict_mv.y / 4;

> +  }

> 

>    srcCoord.x = lgid_x * 16;

>    srcCoord.y = lgid_y * 16;

> 

> -  //TODO: This line of code is just to workaround a curbe related bug caused

> by commit 061d214a6fc2876a0e24e094f87f2a172984bc23

> -  //After fix, this line should be removed.

> -  src_grf0_dw5 = accel.mb_block_type;

> -

>    //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL

>    if(accel.search_path_type == 0x0){

> -    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8)

> | (Dispatch_Id?);

> +    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored

> + << 8) | (Dispatch_Id);

>      src_grf0_dw5 =   (20 << 24)         | (20 << 16)        | (0 << 8)       | (0);

>      //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);

> -    src_grf0_dw1 =   0xfffefffe;

> +    src_grf0_dw1 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) &

> 0x0000ffff);

>      //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);

> -    src_grf0_dw0 =   0xfffefffe;

> +    src_grf0_dw0 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) &

> 0x0000ffff);

>      //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                |

> (Start0Y << 20)

>      src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)

>                     //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);

> @@ -84,35 +89,59 @@ void

> block_motion_estimate_intel(accelerator_intel_t accel,

>    //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL

>    else if(accel.search_path_type == 0x1){

>      src_grf0_dw5 =   (24 << 24)         | (24 << 16)        | (0 << 8)       | (0);

> -    src_grf0_dw1 =   0xfffcfffc;

> -    src_grf0_dw0 =   0xfffcfffc;

> +    src_grf0_dw1 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) &

> 0x0000ffff);

> +    src_grf0_dw0 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) &

> 0x0000ffff);

>      src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)

>                       | (0 << 16)                     | (48 << 8)                       | (48);

>    }

>    //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL

>    else if(accel.search_path_type == 0x5){

>      src_grf0_dw5 =   (40 << 24)         | (48 << 16)        | (0 << 8)       | (0);

> -    src_grf0_dw1 =   0xfff4fff0;

> -    src_grf0_dw0 =   0xfff4fff0;

> +    src_grf0_dw1 =   ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) &

> 0x0000ffff);

> +    src_grf0_dw0 =   ((-12 + predict_mv.y) << 16 ) | ((-16 +  + predict_mv.x) &

> 0x0000ffff);

>      src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)

>                       | (0 << 16)                     | (48 << 8)                       | (48);

>    }

> 

> -  //src_grf0_dw7 = Debug;

> -  src_grf0_dw7 = 0;

> -  //src_grf0_dw6 = Debug;

> -  src_grf0_dw6 = 0;

> -  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) |

> (Dispatch_Id?);

> -  //src_grf0_dw4 = Ignored;

> -  src_grf0_dw4 = 0;

> -  //src_grf0_dw3 = (Reserved << 31)                 | (Sub_Mb_Part_Mask << 24)

> | (Intra_SAD << 22)

> -  src_grf0_dw3 =   (0 << 31)                 | (0x7e << 24)                   | (0 << 22)

> +  /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/

> uchar

> + sub_mb_part_mask = 0;  //CL_ME_MB_TYPE_16x16_INTEL

> + if(accel.mb_block_type == 0x0)

> +    sub_mb_part_mask = 0x7e;

> +  //CL_ME_MB_TYPE_8x8_INTEL

> +  else if(accel.mb_block_type == 0x1)

> +    sub_mb_part_mask = 0x77;

> +  //CL_ME_MB_TYPE_4x4_INTEL

> +  else if(accel.mb_block_type == 0x2)

> +    sub_mb_part_mask = 0x3f;

> +

> +  uchar inter_sad = 0;

> +  //CL_ME_SAD_ADJUST_MODE_NONE_INTEL

> +  if(accel.sad_adjust_mode == 0x0)

> +    inter_sad = 0;

> +  //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL

> +  else if(accel.sad_adjust_mode == 0x1)

> +    inter_sad = 2;

> +

> +  uchar sub_pel_mode = 0;

> +  //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL

> +  if(accel.subpixel_mode == 0x0)

> +    sub_pel_mode = 0;

> +  //CL_ME_SUBPIXEL_MODE_HPEL_INTEL

> +  else if(accel.subpixel_mode == 0x1)

> +    sub_pel_mode = 1;

> +  //CL_ME_SUBPIXEL_MODE_QPEL_INTEL

> +  else if(accel.subpixel_mode == 0x2)

> +    sub_pel_mode = 3;

> +

> +  //src_grf0_dw3 = (Reserved << 31)                | (Sub_Mb_Part_Mask << 24)

> | (Intra_SAD << 22)

> +  src_grf0_dw3 =   (0 << 31)                       | (sub_mb_part_mask << 24)       | (0

> << 22)

>                   //| (Inter_SAD << 20)             | (BB_Skip_Enabled << 19)        |

> (Reserverd << 18)

> -                   | (0 << 20)                     | (0 << 19)                      | (0 << 18)

> +                   | (inter_sad << 20)             | (0 << 19)                      | (0 << 18)

>                   //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)

> | (Dis_Field_Cache_Alloc << 15)

>                     | (0 << 17)                     | (0 << 16)                      | (0 << 15)

>                   //| (Skip_Type << 14)             | (Sub_Pel_Mode << 12)           |

> (Dual_Search_Path_Opt << 11)

> -                   | (0 << 14)                     | (0 << 12)                      | (0 << 11)

> +                   | (0 << 14)                     | (sub_pel_mode << 12)           | (0 << 11)

>                   //| (Search_Ctrl << 8)            | (Ref_Access << 7)              | (SrcAccess

> << 6)

>                     | (0 << 8)                      | (0 << 7)                       | (0 << 6)

>                   //| (Mb_Type_Remap << 4)          | (Reserved_Workaround << 3)     |

> (Reserved_Workaround << 2)

> @@ -120,6 +149,15 @@ void

> block_motion_estimate_intel(accelerator_intel_t accel,

>                   //| (Src_Size);

>                     | (0);

> 

> +

> +  //src_grf0_dw7 = Debug;

> +  src_grf0_dw7 = 0;

> +  //src_grf0_dw6 = Debug;

> +  src_grf0_dw6 = 0;

> +  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored <<

> + 8) | (Dispatch_Id?);

> +  //src_grf0_dw4 = Ignored;

> +  src_grf0_dw4 = 0;

> +

>    //src_grf0_dw2 = (SrcY << 16) | (SrcX);

>    src_grf0_dw2 = (srcCoord.y << 16)  | (srcCoord.x);

>    //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X); @@ -142,7 +180,8 @@ void

> block_motion_estimate_intel(accelerator_intel_t accel,

>    /*src_grf1_dw1 = (RepartEn << 31)                 | (FBPrunEn << 30)               |

> (AdaptiveValidationControl << 29)

>                   | (Uni_Mix_Disable << 28)       | (Bi_Sub_Mb_Part_Mask << 24)    |

> (Reserverd << 22)

>                   | (Bi_Weight << 16)             | (Reserved << 6)                |

> (MaxNumMVs);*/

> -  src_grf1_dw1 = (0 << 24) | (2);

> +  //src_grf1_dw1 = (0 << 24) | (2);

> +  src_grf1_dw1 = (0 << 24) | (16);

>    /*src_grf1_dw0 = (Early_Ime_Stop << 24)           | (Early_Fme_Success << 16)

> | (Skip_Success << 8)

>                   | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6)           |

> (Early_Ime_Success_En << 5)

>                   | (Early_Success_En << 4)       | (Part_Candidate_En << 3)       |

> (Bi_Mix_Dis << 2)

> @@ -201,6 +240,8 @@ void

> block_motion_estimate_intel(accelerator_intel_t accel,

>    src_grf4_dw1 = 0;

>    src_grf4_dw0 = 0;

> 

> +  int lid_x = get_local_id(0);

> +

>    vme_result = __gen_ocl_vme(src_image, ref_image,

>                  src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,

>                  src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, @@ -

> 217,17 +258,109 @@ void block_motion_estimate_intel(accelerator_intel_t

> accel,

> 

>    barrier(CLK_LOCAL_MEM_FENCE);

> 

> -  int lid_x = get_local_id(0);

> +  short2 mv[16];

> +  ushort res[16];

> +

> +  uint write_back_dwx;

>    uint simd_width = get_sub_group_size();

> -  uint write_back_grf1_dw0;

> -  if(simd_width == 8)

> -    write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);

> -  else if(simd_width == 16)

> -    write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);

> -  short2 val = as_short2( write_back_grf1_dw0 );

> -  int index = lgid_y * get_num_groups(0) + lgid_x;

> -  if( lid_x == 0 ){

> -    motion_vector_buffer[index] = val;

> +

> +  /* In simd 8 mode, one kernel variable 'uint' map to 8 dword.

> +   * In simd 16 mode, one kernel variable 'uint' map to 16 dword.

> +   * That's why we should treat simd8 and simd16 differently when

> +   * use __gen_ocl_region.

> +   * */

> +  if(simd_width == 8){

> +    write_back_dwx = __gen_ocl_region(0, vme_result.s1);

> +    mv[0] = as_short2( write_back_dwx );

> +

> +    if(accel.mb_block_type > 0x0){

> +      for(int i = 2, j = 1; j < 4; i += 2, j++){

> +        write_back_dwx = __gen_ocl_region(i, vme_result.s1);

> +        mv[j] = as_short2( write_back_dwx );

> +      }

> +      if(accel.mb_block_type > 0x1){

> +        for(int i = 0, j = 4; j < 8; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i, vme_result.s2);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +        for(int i = 0, j = 8; j < 12; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i, vme_result.s3);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +        for(int i = 0, j = 12; j < 16; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i, vme_result.s4);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +      }

> +    }

> +    ushort2 temp_res;

> +    for(int i = 0; i < 8; i++){

> +      write_back_dwx = __gen_ocl_region(i, vme_result.s5);

> +      temp_res = as_ushort2(write_back_dwx);

> +      res[i*2] = temp_res.s0;

> +      res[i*2+1] = temp_res.s1;

> +    }

> +  }

> +  else if(simd_width == 16){

> +    write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0);

> +    mv[0] = as_short2( write_back_dwx );

> +

> +    if(accel.mb_block_type > 0x0){

> +      for(int i = 2, j = 1; j < 4; i += 2, j++){

> +        write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0);

> +        mv[j] = as_short2( write_back_dwx );

> +      }

> +      if(accel.mb_block_type > 0x1){

> +        for(int i = 0, j = 4; j < 8; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i, vme_result.s1);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +        for(int i = 0, j = 8; j < 12; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +        for(int i = 0, j = 12; j < 16; i += 2, j++){

> +          write_back_dwx = __gen_ocl_region(i, vme_result.s2);

> +          mv[j] = as_short2( write_back_dwx );

> +        }

> +      }

> +    }

> +    ushort2 temp_res;

> +    for(int i = 0; i < 8; i++){

> +      write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2);

> +      temp_res = as_ushort2(write_back_dwx);

> +      res[i*2] = temp_res.s0;

> +      res[i*2+1] = temp_res.s1;

> +    }

> +  }

> +

> +  int mv_index;

> +

> +  //CL_ME_MB_TYPE_16x16_INTEL

> +  if(accel.mb_block_type == 0x0){

> +    mv_index = index * 1;

> +    if( lid_x == 0 ){

> +      motion_vector_buffer[mv_index] = mv[lid_x];

> +      residuals[mv_index] = 2 * res[lid_x];

> +    }

> +  }

> +  //CL_ME_MB_TYPE_8x8_INTEL

> +  else if(accel.mb_block_type == 0x1){

> +    if(lid_x < 4){

> +      mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;

> +      mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);

> +      motion_vector_buffer[mv_index] = mv[lid_x];

> +      residuals[mv_index] = 2 * res[lid_x];

> +    }

> +  }

> +  //CL_ME_MB_TYPE_4x4_INTEL

> +  else if(accel.mb_block_type == 0x2){

> +    if(lid_x < 16){

> +      mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;

> +      mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);

> +      motion_vector_buffer[mv_index] = mv[lid_x];

> +      residuals[mv_index] = 2 * res[lid_x];

> +    }

>    }

> 

>  }

> --

> 1.9.1

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet