benchmark test for global read and write bandwidth

Submitted by Meng, Mengmeng on Aug. 15, 2015, 2:48 p.m.

Details

Message ID 1439650081-8240-1-git-send-email-mengmeng.meng@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Meng, Mengmeng Aug. 15, 2015, 2:48 p.m.
---
 benchmark/CMakeLists.txt        |  3 ++-
 benchmark/benchmark_io_test.cpp | 48 +++++++++++++++++++++++++++++++++++++++++
 kernels/benchmark_io.cl         | 26 ++++++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/benchmark_io_test.cpp
 create mode 100644 kernels/benchmark_io.cl

Patch hide | download patch | download mbox

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 3e43a21..1b9fe54 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -16,7 +16,8 @@  set (benchmark_sources
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
   benchmark_copy_buffer_to_image.cpp
-  benchmark_copy_image_to_buffer.cpp)
+  benchmark_copy_image_to_buffer.cpp
+  benchmark_io_test)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_io_test.cpp b/benchmark/benchmark_io_test.cpp
new file mode 100644
index 0000000..5c95ce3
--- /dev/null
+++ b/benchmark/benchmark_io_test.cpp
@@ -0,0 +1,48 @@ 
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+struct timeval start,stop;
+const size_t n = 1024 * 1024;
+int count = 16;
+const size_t sz = 4 * n * count;
+
+#define BENCH_address(V,T) \
+static double benchmark ##V(void) \
+{ \
+ \
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_ONLY, sz * sizeof(float), NULL); \
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_ONLY, sz * sizeof(float), NULL); \
+ \
+  OCL_CREATE_KERNEL_FROM_FILE("benchmark_io",T); \
+ \
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+ \
+  OCL_MAP_BUFFER(0); \
+  for (size_t i = 0; i < sz; i ++) { \
+    ((float *)(buf_data[0]))[i] = rand(); \
+  } \
+  OCL_UNMAP_BUFFER(0);\
+ \
+  globals[0] = n; \
+  locals[0] = 256; \
+ \
+  gettimeofday(&start,0); \
+  for (size_t i=0; i<100; i++) { \
+    OCL_NDRANGE(1);\
+  } \
+  OCL_FINISH();\
+  gettimeofday(&stop,0);\
+ \
+  clReleaseMemObject(buf[0]);\
+  free(buf_data[0]);\
+  buf_data[0] = NULL; \
+\
+  double elapsed = time_subtract(&stop, &start, 0);\
+\
+  return BANDWIDTH(sz * sizeof(float) * 1 * 100, elapsed);\
+} \
+MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(benchmark ##V,true);
+
+BENCH_address(_global_write,"benchmark_global_write")
+BENCH_address(_global_read,"benchmark_global_read")
diff --git a/kernels/benchmark_io.cl b/kernels/benchmark_io.cl
new file mode 100644
index 0000000..6a3150b
--- /dev/null
+++ b/kernels/benchmark_io.cl
@@ -0,0 +1,26 @@ 
+#define COUNT 100
+__kernel void
+benchmark_global_write(__global float * src,  __global float* dst)
+{
+  float sum = 0 ;
+  int id = (int)get_global_id(0);
+
+  if (id%10 == 1)
+    dst[id] = src[id]/2 + 1;
+  else
+    dst[id] = src[id]/2 - 1;
+}
+__kernel void
+benchmark_global_read(__global float * src,  __global float* dst)
+{
+  float sum = 0 ;
+  int id = (int)get_global_id(0);
+
+  for (int i=1; i<COUNT; i++) {
+    sum = sum + src[id%i];
+  }
+
+  if (id%10 == 1)
+    dst[id] = sum;
+}
+

Comments

Some comments, thanks.

> -----Original Message-----

> From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf Of

> Meng Mengmeng

> Sent: Saturday, August 15, 2015 22:48

> To: beignet@lists.freedesktop.org

> Cc: Meng, Mengmeng

> Subject: [Beignet] [PATCH] benchmark test for global read and write

> bandwidth

> 

> ---

>  benchmark/CMakeLists.txt        |  3 ++-

>  benchmark/benchmark_io_test.cpp | 48

> +++++++++++++++++++++++++++++++++++++++++

>  kernels/benchmark_io.cl         | 26 ++++++++++++++++++++++

>  3 files changed, 76 insertions(+), 1 deletion(-)  create mode 100644

> benchmark/benchmark_io_test.cpp  create mode 100644

> kernels/benchmark_io.cl

> 

> diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index

> 3e43a21..1b9fe54 100644

> --- a/benchmark/CMakeLists.txt

> +++ b/benchmark/CMakeLists.txt

> @@ -16,7 +16,8 @@ set (benchmark_sources

>    benchmark_read_buffer.cpp

>    benchmark_read_image.cpp

>    benchmark_copy_buffer_to_image.cpp

> -  benchmark_copy_image_to_buffer.cpp)

> +  benchmark_copy_image_to_buffer.cpp

> +  benchmark_io_test)

> 

> 

>  SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")

> diff --git a/benchmark/benchmark_io_test.cpp

> b/benchmark/benchmark_io_test.cpp new file mode 100644 index

> 0000000..5c95ce3

> --- /dev/null

> +++ b/benchmark/benchmark_io_test.cpp

> @@ -0,0 +1,48 @@

> +#include "utests/utest_helper.hpp"

> +#include <sys/time.h>

> +

> +struct timeval start,stop;

> +const size_t n = 1024 * 1024;

> +int count = 16;

> +const size_t sz = 4 * n * count;

Why is the size 4 * n * count?

> +

> +#define BENCH_address(V,T) \

> +static double benchmark ##V(void) \

> +{ \

> + \

> +  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_ONLY, sz * sizeof(float),

> +NULL); \ 

The CPU also accesses buf[0], so the cl_mem_flags should not be CL_MEM_READ_ONLY, right?

> +  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_ONLY, sz * sizeof(float),

> +NULL); \  \

Should this be CL_MEM_WRITE_ONLY?

> +  OCL_CREATE_KERNEL_FROM_FILE("benchmark_io",T); \  \

> +  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\

> +  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \  \

> +  OCL_MAP_BUFFER(0); \

> +  for (size_t i = 0; i < sz; i ++) { \

> +    ((float *)(buf_data[0]))[i] = rand(); \

> +  } \

> +  OCL_UNMAP_BUFFER(0);\

> + \

> +  globals[0] = n; \

> +  locals[0] = 256; \

> + \

> +  gettimeofday(&start,0); \

> +  for (size_t i=0; i<100; i++) { \

> +    OCL_NDRANGE(1);\

> +  } \

> +  OCL_FINISH();\

> +  gettimeofday(&stop,0);\

> + \

> +  clReleaseMemObject(buf[0]);\

> +  free(buf_data[0]);\

> +  buf_data[0] = NULL; \

buf[1] also needs to be released.

> +\

> +  double elapsed = time_subtract(&stop, &start, 0);\ \

> +  return BANDWIDTH(sz * sizeof(float) * 1 * 100, elapsed);\ } \

> +MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(benchmark

> ##V,true);

> +

> +BENCH_address(_global_write,"benchmark_global_write")

> +BENCH_address(_global_read,"benchmark_global_read")

> diff --git a/kernels/benchmark_io.cl b/kernels/benchmark_io.cl new file

> mode 100644 index 0000000..6a3150b

> --- /dev/null

> +++ b/kernels/benchmark_io.cl

> @@ -0,0 +1,26 @@

> +#define COUNT 100

> +__kernel void

> +benchmark_global_write(__global float * src,  __global float* dst) {

> +  float sum = 0 ;

> +  int id = (int)get_global_id(0);

> +

> +  if (id%10 == 1)

> +    dst[id] = src[id]/2 + 1;

> +  else

> +    dst[id] = src[id]/2 - 1;

Each write is paired with a read, so this does not measure write performance alone.

> +}

> +__kernel void

> +benchmark_global_read(__global float * src,  __global float* dst) {

> +  float sum = 0 ;

> +  int id = (int)get_global_id(0);

> +

> +  for (int i=1; i<COUNT; i++) {

> +    sum = sum + src[id%i];

This only reads elements 0 to i-1, so there will be a lot of cache hits; I'm afraid it does not reflect realistic read performance. It would be better to read from different memory locations.

> +  }

> +

> +  if (id%10 == 1)

> +    dst[id] = sum;

> +}

> +

> --

> 1.9.1

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet