[03/10] Utest: Add workgroup reduce tests

Submitted by Grigore Lupescu on March 31, 2016, 3:27 p.m.

Details

Message ID 1459438035-31763-1-git-send-email-grigore.lupescu@intel.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Beignet

Not browsing as part of any series.

Commit Message

Grigore Lupescu March 31, 2016, 3:27 p.m.
From: Grigore Lupescu <grigore.lupescu at intel.com>

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 kernels/compiler_workgroup_reduce.cl | 181 ++++++++++--
 utests/compiler_workgroup_reduce.cpp | 557 ++++++++++++++++++++++-------------
 2 files changed, 512 insertions(+), 226 deletions(-)

Patch hide | download patch | download mbox

diff --git a/kernels/compiler_workgroup_reduce.cl b/kernels/compiler_workgroup_reduce.cl
index 1fc57b5..34fd371 100644
--- a/kernels/compiler_workgroup_reduce.cl
+++ b/kernels/compiler_workgroup_reduce.cl
@@ -1,40 +1,171 @@ 
-kernel void compiler_workgroup_reduce_min_uniform(uint src, global uint *dst) {
-   uint min_val = work_group_reduce_min(src);
-   dst[get_local_id(0)] = min_val;
+/*
+ * Workgroup reduce add functions
+ */
+kernel void compiler_workgroup_reduce_add_char(global char *src, global char *dst) {
+  char val = src[get_global_id(0)];
+  char sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
-kernel void compiler_workgroup_reduce_min_uint(global uint *src, global uint *dst) {
-   uint val = src[get_local_id(0)];
-   uint min_val = work_group_reduce_min(val);
-   dst[get_local_id(0)] = min_val;
+kernel void compiler_workgroup_reduce_add_uchar(global uchar *src, global uchar *dst) {
+  uchar val = src[get_global_id(0)];
+  uchar sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
-kernel void compiler_workgroup_reduce_max_uint(global uint *src, global uint *dst) {
-   uint val = src[get_local_id(0)];
-   uint max_val = work_group_reduce_max(val);
-   dst[get_local_id(0)] = max_val;
+kernel void compiler_workgroup_reduce_add_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
-kernel void compiler_workgroup_reduce_min_float(global float *src, global float *dst) {
-   float val = src[get_local_id(0)];
-   float min_val = work_group_reduce_min(val);
-   dst[get_local_id(0)] = min_val;
+kernel void compiler_workgroup_reduce_add_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
-kernel void compiler_workgroup_reduce_max_float(global float *src, global float *dst) {
-   float val = src[get_local_id(0)];
-   float max_val = work_group_reduce_max(val);
-   dst[get_local_id(0)] = max_val;
+kernel void compiler_workgroup_reduce_add_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
 kernel void compiler_workgroup_reduce_add_uint(global uint *src, global uint *dst) {
-   uint val = src[get_local_id(0)];
-   uint sum = work_group_reduce_add(val);
-   dst[get_local_id(0)] = sum;
+  uint val = src[get_global_id(0)];
+  uint sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_add_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
 }
 
 kernel void compiler_workgroup_reduce_add_float(global float *src, global float *dst) {
-   float val = src[get_local_id(0)];
-   float sum = work_group_reduce_add(val);
-   dst[get_local_id(0)] = sum;
+  float val = src[get_global_id(0)];
+  float sum = work_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup reduce max functions
+ */
+kernel void compiler_workgroup_reduce_max_char(global char *src, global char *dst) {
+  char val = src[get_global_id(0)];
+  char sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_uchar(global uchar *src, global uchar *dst) {
+  uchar val = src[get_global_id(0)];
+  uchar sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_max_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
 }
+
+kernel void compiler_workgroup_reduce_max_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = work_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Workgroup reduce min functions
+ */
+kernel void compiler_workgroup_reduce_min_char(global char *src, global char *dst) {
+  char val = src[get_global_id(0)];
+  char sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_uchar(global uchar *src, global uchar *dst) {
+  uchar val = src[get_global_id(0)];
+  uchar sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_workgroup_reduce_min_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = work_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
diff --git a/utests/compiler_workgroup_reduce.cpp b/utests/compiler_workgroup_reduce.cpp
index 4097843..7fced74 100644
--- a/utests/compiler_workgroup_reduce.cpp
+++ b/utests/compiler_workgroup_reduce.cpp
@@ -1,243 +1,398 @@ 
 #include <cstdint>
 #include <cstring>
 #include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
 #include "utest_helper.hpp"
 
-void compiler_workgroup_reduce_min_uniform(void)
-{
-  const size_t n = 17;
-  uint32_t src = 253;
+using namespace std;
 
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_min_uniform");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-  OCL_SET_ARG(0, sizeof(uint32_t), &src);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
-  globals[0] = n;
-  locals[0] = n;
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 64
+#define WG_LOCAL_SIZE 32
 
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(0);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%u ", ((uint32_t *)buf_data[0])[i]);
-    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == 253);
+enum WG_FUNCTION
+{
+  WG_REDUCE_ADD,
+  WG_REDUCE_MIN,
+  WG_REDUCE_MAX
+};
+
+/*
+ * Generic compute-expected on CPU function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                    T* input,
+                    T* expected)
+{
+  if(wg_func == WG_REDUCE_ADD)
+  {
+    T wg_sum = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_sum += input[i];
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_sum;
+  }
+  else if(wg_func == WG_REDUCE_MAX)
+  {
+    T wg_max = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_max;
+  }
+  else if(wg_func == WG_REDUCE_MIN)
+  {
+    T wg_min = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_min;
   }
-  OCL_UNMAP_BUFFER(0);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_uniform);
-
-static uint32_t test_array_uint[64] = {23, 34, 16, 91, 25, 133, 7787, 134, 987, 9853, 33, 21, 865, 1441, 9083, 812,
-                                  10, 43435, 63, 445, 253, 65, 24, 30, 76, 989, 120 ,113 ,133, 41, 18, 91,
-                                  8321, 6712, 881, 911, 5, 788, 8991, 88, 19, 1110, 1231, 1341, 1983, 1983, 91, 212,
-                                  712, 31, 881, 963, 6801, 651, 9810, 77, 98, 5, 16, 1888, 141, 1613, 1771, 16};
-
-void compiler_workgroup_reduce_min_uint(void)
+/*
+ * Generic input-expected generate function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                   T* &input,
+                   T* &expected)
 {
-  const size_t n = 60;
-  uint32_t* src = test_array_uint;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_min_uint");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
-
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(uint32_t));
-  OCL_UNMAP_BUFFER(0);
-
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%u ", ((uint32_t *)buf_data[1])[i]);
-    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == 5);
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all datatypes */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+  {
+    /* input values */
+    cout << endl << "IN: " << endl;
+    for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++){
+      input[gid + lid] = (rand() % 2 - 1) * base_val + (rand() % 112);
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+
+    /* expected values */
+    cout << endl << "EXP: " << endl;
+    compute_expected(wg_func, input + gid, expected + gid);
+    for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++){
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
   }
-  OCL_UNMAP_BUFFER(1);
 }
-MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_uint);
 
-void compiler_workgroup_reduce_max_uint(void)
+/*
+ * Generic workgroup utest function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void workgroup_generic(WG_FUNCTION wg_func,
+                       T* input,
+                       T* expected)
 {
-  const size_t n = 60;
-  uint32_t* src = test_array_uint;
+  /* input and expected data */
+  generate_data(wg_func, input, expected);
 
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_max_uint");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  /* prepare input for datatype */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
 
+  /* set input data for GPU */
   OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(uint32_t));
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
   OCL_UNMAP_BUFFER(0);
 
-  // Run the kernel on GPU
+  /* run the kernel on GPU */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
   OCL_NDRANGE(1);
 
-  // Compare
+  /* check if mistmatch */
   OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%u ", ((uint32_t *)buf_data[1])[i]);
-    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == 43435);
-  }
+  uint32_t mistmatches = 0;
+  cout << endl << endl << "CHECK" << endl;
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i)){
+      cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+      mistmatches++;
+    }
+  cout << "MISTMATCHES " << mistmatches << endl;
+
+  cout << std::flush;
   OCL_UNMAP_BUFFER(1);
+
+  OCL_ASSERT(mistmatches == 0);
 }
-MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_uint);
 
+/*
+ * Workgroup reduce add utest functions
+ */
+void compiler_workgroup_reduce_add_char(void)
+{
+  cl_char *input = NULL;
+  cl_char *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_char");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_char);
+void compiler_workgroup_reduce_add_uchar(void)
+{
+  cl_uchar *input = NULL;
+  cl_uchar *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_uchar");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_uchar);
+void compiler_workgroup_reduce_add_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_short");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_short);
+void compiler_workgroup_reduce_add_ushort(void)
+{
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_ushort");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_ushort);
+void compiler_workgroup_reduce_add_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_int");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_int);
 void compiler_workgroup_reduce_add_uint(void)
 {
-  const size_t n = 50;
-  uint32_t* src = test_array_uint;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_add_uint");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
-
-  uint32_t cpu_res = 0;
-  for (size_t i = 0; i < n; i++)
-    cpu_res += src[i];
-
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(uint32_t));
-  OCL_UNMAP_BUFFER(0);
-
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%u ", ((uint32_t *)buf_data[1])[i]);
-    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == cpu_res);
-  }
-  OCL_UNMAP_BUFFER(1);
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_uint");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
 }
 MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_uint);
-
-static float test_array_float[64] =
-  {1.0234f, 0.34e32f, -13441.4334f, 1893.21f, -9999.0f, -88.00f, 1.3f, 1.0f,
-   2.33f, 134.44f, 263.0f, 1.0f, 0.0f, 344.900043f, 0.1e30f, 1.0e10f,
-
-   10.0f, 43.435f, 6.3f, 44.545f, 0.253f, 6.5f, 0.24f, 10.30f,
-   1312.76f, -0.00989f, 124213.120f, 1.13f, 1.33f, 4.1f, 1.8f, 3234.91f,
-
-   3.21e38f, 6.712f, 0.881f, 12.91f, 5.0f, 7.88f, 128991.0f, 8.8f,
-   0.0019f, -0.1110f, 12.0e31f, -3.3E38f, 1.983f, 1.983f, 10091.0f, 2.12f,
-
-   0.88712, 1e31f, -881.0f, -196e3f, 68.01f, -651.121f, 9.810f, -0.77f,
-   100.98f, 50.0f, 1000.16f, -18e18f, 0.141f, 1613.0f, 1.771f, -16.13f};
-
-void compiler_workgroup_reduce_min_float(void)
+void compiler_workgroup_reduce_add_long(void)
 {
-  const size_t n = 60;
-  float* src = test_array_float;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_min_float");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
-
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(float));
-  OCL_UNMAP_BUFFER(0);
-
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%f ", ((float *)buf_data[1])[i]);
-    OCL_ASSERT(((float *)buf_data[1])[i] == -3.3E38f);
-  }
-  OCL_UNMAP_BUFFER(1);
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_long");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
 }
-MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_float);
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_long);
+void compiler_workgroup_reduce_add_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_ulong");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_ulong);
+void compiler_workgroup_reduce_add_float(void)
+{
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_float");
+  workgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_float);
 
+/*
+ * Workgroup reduce max utest functions
+ */
+void compiler_workgroup_reduce_max_char(void)
+{
+  cl_char *input = NULL;
+  cl_char *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_char");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_char);
+void compiler_workgroup_reduce_max_uchar(void)
+{
+  cl_uchar *input = NULL;
+  cl_uchar *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_uchar");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_uchar);
+void compiler_workgroup_reduce_max_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_short");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_short);
+void compiler_workgroup_reduce_max_ushort(void)
+{
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_ushort");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_ushort);
+void compiler_workgroup_reduce_max_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_int");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_int);
+void compiler_workgroup_reduce_max_uint(void)
+{
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_uint");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_uint);
+void compiler_workgroup_reduce_max_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_long");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_long);
+void compiler_workgroup_reduce_max_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_ulong");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_ulong);
 void compiler_workgroup_reduce_max_float(void)
 {
-  const size_t n = 60;
-  float* src = test_array_float;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_max_float");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
-
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(float));
-  OCL_UNMAP_BUFFER(0);
-
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%f ", ((float *)buf_data[1])[i]);
-    OCL_ASSERT(((float *)buf_data[1])[i] == 3.21e38f);
-  }
-  OCL_UNMAP_BUFFER(1);
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_max_float");
+  workgroup_generic(WG_REDUCE_MAX, input, expected);
 }
 MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_max_float);
 
-void compiler_workgroup_reduce_add_float(void)
+/*
+ * Workgroup reduce min utest functions
+ */
+void compiler_workgroup_reduce_min_char(void)
 {
-  const size_t n = 42;
-  float* src = test_array_float;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce", "compiler_workgroup_reduce_add_float");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  globals[0] = n;
-  locals[0] = n;
-
-  float cpu_res = 0;
-  for (size_t i = 0; i < n; i++)
-    cpu_res += src[i];
-
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, n * sizeof(float));
-  OCL_UNMAP_BUFFER(0);
-
-  // Run the kernel on GPU
-  OCL_NDRANGE(1);
-
-  // Compare
-  OCL_MAP_BUFFER(1);
-  for (int32_t i = 0; i < (int32_t) n; ++i) {
-    //printf("%f ", ((float *)buf_data[1])[i]);
-    OCL_ASSERT(((float *)buf_data[1])[i] == cpu_res);
-  }
-  OCL_UNMAP_BUFFER(1);
+  cl_char *input = NULL;
+  cl_char *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_char");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
 }
-MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_add_float);
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_char);
+void compiler_workgroup_reduce_min_uchar(void)
+{
+  cl_uchar *input = NULL;
+  cl_uchar *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_uchar");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_uchar);
+void compiler_workgroup_reduce_min_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_short");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_short);
+void compiler_workgroup_reduce_min_ushort(void)
+{
+  cl_ushort *input = NULL;
+  cl_ushort *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_ushort");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_ushort);
+void compiler_workgroup_reduce_min_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_int");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_int);
+void compiler_workgroup_reduce_min_uint(void)
+{
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_uint");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_uint);
+void compiler_workgroup_reduce_min_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_long");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_long);
+void compiler_workgroup_reduce_min_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_ulong");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_ulong);
+void compiler_workgroup_reduce_min_float(void)
+{
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_min_float");
+  workgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_workgroup_reduce_min_float);