optimize the copy buffer rect aligned16 and aligned128.

Submitted by Luo, Xionghu on July 24, 2015, 3 a.m.

Details

Message ID 1437706839-8077-1-git-send-email-xionghu.luo@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Luo, Xionghu July 24, 2015, 3 a.m.
From: Luo Xionghu <xionghu.luo@intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo@intel.com>
---
 src/CMakeLists.txt                                |  2 +-
 src/cl_context.h                                  |  2 ++
 src/cl_gt_device.h                                |  3 +++
 src/cl_mem.c                                      | 33 ++++++++++++++++++++++-
 src/kernels/cl_internal_copy_buf_rect_align128.cl | 18 +++++++++++++
 src/kernels/cl_internal_copy_buf_rect_align16.cl  | 18 +++++++++++++
 6 files changed, 74 insertions(+), 2 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buf_rect_align128.cl
 create mode 100644 src/kernels/cl_internal_copy_buf_rect_align16.cl

Patch hide | download patch | download mbox

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4e67c71..1e8c59b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -44,7 +44,7 @@  set (KERNEL_STR_FILES)
 set (KERNEL_NAMES cl_internal_copy_buf_align4
 cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
 cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
-cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4
+cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 cl_internal_copy_buf_rect_align16 cl_internal_copy_buf_rect_align128
 cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
 cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
 cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_array
diff --git a/src/cl_context.h b/src/cl_context.h
index 249fed8..662d196 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -48,6 +48,8 @@  enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
   CL_ENQUEUE_COPY_BUFFER_RECT,
   CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
+  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN128,
   CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
   CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
   CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 4b43c20..1eb4cbd 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -94,6 +94,9 @@  DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
                                    "__cl_copy_region_unalign_dst_offset;"
                                    "__cl_copy_region_unalign_src_offset;"
                                    "__cl_copy_buffer_rect;"
+                                   "__cl_copy_buffer_rect_align4;"
+                                   "__cl_copy_buffer_rect_align16;"
+                                   "__cl_copy_buffer_rect_align128;"
                                    "__cl_copy_image_1d_to_1d;"
                                    "__cl_copy_image_2d_to_2d;"
                                    "__cl_copy_image_3d_to_2d;"
diff --git a/src/cl_mem.c b/src/cl_mem.c
index f6aa5b5..37f2733 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1510,7 +1510,37 @@  cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
 
   /* setup the kernel and run. */
   size_t region0 = region[0];
-  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
+  int aligned128 = 128;
+  int aligned16 = 16;
+  if((src_offset % aligned128 == 0) && (dst_offset % aligned128== 0) && (src_row_pitch % aligned128 == 0) && (dst_row_pitch % aligned128 == 0)
+  && (src_slice_pitch % aligned128 == 0) && (dst_slice_pitch % aligned128 == 0) && (region0 % aligned128 == 0) ){
+    extern char cl_internal_copy_buf_rect_align128_str[];
+    extern size_t cl_internal_copy_buf_rect_align128_str_size;
+    region0 /= aligned128;
+    src_offset /= aligned128;
+    dst_offset /= aligned128;
+    src_row_pitch /= aligned128;
+    dst_row_pitch /= aligned128;
+    src_slice_pitch /= aligned128;
+    dst_slice_pitch /= aligned128;
+    global_sz[0] /= aligned128;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN128,
+    cl_internal_copy_buf_rect_align128_str, (size_t)cl_internal_copy_buf_rect_align128_str_size, NULL);
+  }else if( (src_offset % aligned16 == 0) && (dst_offset % aligned16== 0) && (src_row_pitch % aligned16 == 0) && (dst_row_pitch % aligned16 == 0)
+  && (src_slice_pitch % aligned16 == 0) && (dst_slice_pitch % aligned16 == 0) && (region0 % aligned16 == 0) ){
+    extern char cl_internal_copy_buf_rect_align16_str[];
+    extern size_t cl_internal_copy_buf_rect_align16_str_size;
+    region0 /= aligned16;
+    src_offset /= aligned16;
+    dst_offset /= aligned16;
+    src_row_pitch /= aligned16;
+    dst_row_pitch /= aligned16;
+    src_slice_pitch /= aligned16;
+    dst_slice_pitch /= aligned16;
+    global_sz[0] /= aligned16;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN16,
+    cl_internal_copy_buf_rect_align16_str, (size_t)cl_internal_copy_buf_rect_align16_str_size, NULL);
+  }else if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
   && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (region0 % 4 == 0) ){
     extern char cl_internal_copy_buf_rect_align4_str[];
     extern size_t cl_internal_copy_buf_rect_align4_str_size;
@@ -1521,6 +1551,7 @@  cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
     dst_row_pitch /= 4;
     src_slice_pitch /= 4;
     dst_slice_pitch /= 4;
+    global_sz[0] /= 4;
     ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
     cl_internal_copy_buf_rect_align4_str, (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL);
   }else{
diff --git a/src/kernels/cl_internal_copy_buf_rect_align128.cl b/src/kernels/cl_internal_copy_buf_rect_align128.cl
new file mode 100644
index 0000000..d1a3133
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect_align128.cl
@@ -0,0 +1,18 @@ 
+kernel void __cl_copy_buffer_rect_align128 ( global uint16* src, global uint16* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0)*16;
+  int j = get_global_id(1)*16;
+  int k = get_global_id(2)*16;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  #pragma unroll
+  for(int s = 0; s < 16; s++) {
+	  dst[dst_offset+s] = src[src_offset+s];
+  }
+}
diff --git a/src/kernels/cl_internal_copy_buf_rect_align16.cl b/src/kernels/cl_internal_copy_buf_rect_align16.cl
new file mode 100644
index 0000000..fdc7ddf
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect_align16.cl
@@ -0,0 +1,18 @@ 
+kernel void __cl_copy_buffer_rect_align16 ( global uint4* src, global uint4* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0)*4;
+  int j = get_global_id(1)*4;
+  int k = get_global_id(2)*4;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  #pragma unroll
+  for(int s = 0; s < 4; s++) {
+	  dst[dst_offset+s] = src[src_offset+s];
+  }
+}