[6/6] Implement printf for new runtime.

Submitted by junyan.he@inbox.com on June 23, 2017, 10:18 a.m.

Details

Message ID 1498213109-10042-6-git-send-email-junyan.he@inbox.com
State New
Series "Series without cover letter"
Headers show

Commit Message

junyan.he@inbox.com June 23, 2017, 10:18 a.m.
From: Junyan He <junyan.he@intel.com>

Store each kernel's printf statements in the ELF file and output the
printf log when the ND_Range has finished.

Signed-off-by: Junyan He <junyan.he@intel.com>
---
 backend/src/backend/gen_program_elf.cpp |  47 ++-
 backend/src/backend/program.hpp         |   6 +-
 backend/src/ir/printf.hpp               |  37 +-
 backend/src/llvm/llvm_printf_parser.cpp |  16 +-
 runtime/gen/CMakeLists.txt              |   1 +
 runtime/gen/cl_command_queue_gen.c      |  85 ++++-
 runtime/gen/cl_gen.h                    |   6 +
 runtime/gen/cl_kernel_gen.c             | 131 ++++---
 runtime/gen/cl_printf_gen.c             | 633 ++++++++++++++++++++++++++++++++
 9 files changed, 907 insertions(+), 55 deletions(-)
 create mode 100644 runtime/gen/cl_printf_gen.c

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen_program_elf.cpp b/backend/src/backend/gen_program_elf.cpp
index 566ee10..304f491 100644
--- a/backend/src/backend/gen_program_elf.cpp
+++ b/backend/src/backend/gen_program_elf.cpp
@@ -214,14 +214,30 @@  using namespace ELFIO;
 /* The format for Compiler info is:
  -------------------------------
  | GEN_NOTE_TYPE_COMPILER_INFO |
- ----------------------------------------
+ --------------------------------------
  | Compiler name (GBE_Compiler  e.g.) |
- ----------------------------------------
+ --------------------------------------
  | LLVM version major:4 |
  ------------------------
  | LLVM version minor:4 |
  ------------------------ */
 
+/* The format for printf is:
+ ---------------------------
+ | GEN_NOTE_TYPE_CL_PRINTF |
+ ---------------------------
+ | The Kernel name |
+ -------------------------------
+ | CL printf bti:4 |
+ ----------------------
+ | CL printf number:4 |
+ -------------------------------------------
+ | CL printf id for one printf statement:4 |
+ -------------------------------------------
+ | printf format string |
+ ------------------------
+ */
+
 class GenProgramElfContext
 {
 public:
@@ -232,6 +248,7 @@  public:
     GEN_NOTE_TYPE_CL_INFO = 4,
     GEN_NOTE_TYPE_CL_DEVICE_ENQUEUE_INFO = 5,
     GEN_NOTE_TYPE_COMPILER_INFO = 6,
+    GEN_NOTE_TYPE_CL_PRINTF = 7,
   };
 
   struct KernelInfoHelper {
@@ -394,6 +411,32 @@  void GenProgramElfContext::emitOneKernelCLInfo(GenKernel &kernel)
   uint32_t wg_sz_size = 0;
   uint32_t arg_info_size = 0;
 
+  /* Add printf info for this kernel */
+  if (kernel.getPrintfNum() != 0) {
+    std::map<uint32_t, std::string> all_printf;
+    uint32_t printf_n = kernel.collectPrintfStr(all_printf);
+    assert(printf_n == kernel.getPrintfNum());
+    std::ostringstream oss;
+    size_t sz = 0;
+
+    uint32_t bti = kernel.getPrintfBufBTI();
+    oss.write((char *)(&bti), sizeof(uint32_t));
+    sz += sizeof(uint32_t);
+    oss.write((char *)(&printf_n), sizeof(uint32_t));
+    sz += sizeof(uint32_t);
+
+    for (auto iter = all_printf.begin(); iter != all_printf.end(); iter++) {
+      uint32_t id = iter->first;
+      oss.write((char *)(&id), sizeof(uint32_t));
+      sz += sizeof(uint32_t);
+      oss.write(iter->second.c_str(), strlen(iter->second.c_str()) + 1);
+      sz += strlen(iter->second.c_str()) + 1;
+    }
+
+    this->cl_note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_CL_PRINTF,
+                                   kernel.getName(), oss.str().c_str(), sz);
+  }
+
   if ((kernel.getFunctionAttributes())[0] != 0)
     attr_size = ::strlen(kernel.getFunctionAttributes()) + 1;
   all_str_len = ALIGN(attr_size, 4);
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index b2ab3f2..822057f 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -168,7 +168,11 @@  namespace gbe {
     uint32_t getPrintfNum() const {
       return printfSet ? printfSet->getPrintfNum() : 0;
     }
-
+    /* Collect the printf strings of this kernel into all_printf. Forwards to
+     * the printf set and returns its result, or 0 when the kernel has none. */
+    uint32_t collectPrintfStr(std::map<uint32_t, std::string>& all_printf) const {
+      return printfSet ? printfSet->collectPrintfStr(all_printf) : 0;
+    }
     void * dupPrintfSet() const {
       void* ptr = printfSet ? (void *)(new ir::PrintfSet(*printfSet)) : NULL;
       return ptr;
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index 728aa68..28944c7 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -123,7 +123,7 @@  namespace gbe
         type = PRINTF_SLOT_TYPE_STRING;
       }
 
-      PrintfSlot(PrintfState& st) {
+      PrintfSlot(PrintfState& st, std::string& s) : str(s) {
         type = PRINTF_SLOT_TYPE_STATE;
         state = st;
       }
@@ -135,6 +135,7 @@  namespace gbe
         } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
           type = PRINTF_SLOT_TYPE_STATE;
           state = other.state;
+          str = other.str;
         } else {
           type = PRINTF_SLOT_TYPE_NONE;
         }
@@ -245,6 +246,40 @@  namespace gbe
 
       void outputPrintf(void* buf_addr);
 
+      /* Rebuild one string per entry of fmts and insert it into all_printf
+       * under the same key. A state slot whose conversion is %s contributes
+       * state.str, truncated to the precision and padded with spaces up to
+       * min_width (on the right when left_justified, otherwise on the left);
+       * every other slot contributes its str member unchanged.
+       * Returns the number of entries visited. */
+      uint32_t collectPrintfStr(std::map<uint32_t, std::string>& all_printf) const {
+        uint32_t count = 0;
+        for (const auto &entry : fmts) {
+          std::string text;
+          for (auto &slot : entry.second) {
+            const bool is_string_state = slot.type == PRINTF_SLOT_TYPE_STATE &&
+                                         slot.state.conversion_specifier == PRINTF_CONVERSION_S;
+            if (!is_string_state) {
+              text += slot.str;
+              continue;
+            }
+
+            std::string piece = slot.state.str;
+            if (slot.state.precision > 0 && static_cast<size_t>(slot.state.precision) < piece.size())
+              piece.resize(slot.state.precision);
+
+            if (slot.state.min_width > 0 && static_cast<size_t>(slot.state.min_width) > piece.size()) {
+              const std::string pad(static_cast<size_t>(slot.state.min_width) - piece.size(), ' ');
+              piece = slot.state.left_justified ? piece + pad : pad + piece;
+            }
+
+            text += piece;
+          }
+
+          all_printf.insert(std::pair<uint32_t, std::string>(entry.first, text));
+          ++count;
+        }
+
+        return count;
+      }
+
     private:
       std::map<uint32_t, PrintfFmt> fmts;
       friend struct LockOutput;
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 6bb7c52..b8c6114 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -245,16 +245,18 @@  again:
 
       /* Now parse the % start conversion_specifier. */
       ret_char = __parse_printf_state(p, end, &rend, &state);
-      if (ret_char < 0)
+      if (ret_char < 0) {
         goto error;
+      } else {
+        std::string s(p, size_t(rend - p));
+        printf_fmt->push_back(PrintfSlot(state, s));
+        num++;
 
-      printf_fmt->push_back(state);
-      num++;
-
-      if (rend == end)
-        break;
+        if (rend == end)
+          break;
 
-      begin = rend;
+        begin = rend;
+      }
     }
 
 #if 0
diff --git a/runtime/gen/CMakeLists.txt b/runtime/gen/CMakeLists.txt
index 83ed0c6..142f1b3 100644
--- a/runtime/gen/CMakeLists.txt
+++ b/runtime/gen/CMakeLists.txt
@@ -78,6 +78,7 @@  set(OPENCL_GEN_SRC
 	cl_compiler_gen.c
 	cl_event_gen.c
 	cl_sampler_gen.c
+	cl_printf_gen.c
 	)
 
 if (X11_FOUND)
diff --git a/runtime/gen/cl_command_queue_gen.c b/runtime/gen/cl_command_queue_gen.c
index 0cb19f9..162c4a5 100644
--- a/runtime/gen/cl_command_queue_gen.c
+++ b/runtime/gen/cl_command_queue_gen.c
@@ -82,6 +82,14 @@  typedef struct gen_gpgpu {
   } mem;
 
   struct {
+    uint32_t printf_buf_size;
+    drm_intel_bo *printf_bo; /* Printf buffer */
+    uint32_t printf_num;
+    cl_uint *printf_ids;
+    char **printf_strings;
+  } printf;
+
+  struct {
     uint64_t sampler_bitmap; /* sampler usage bitmap. */
   } sampler;
 
@@ -460,6 +468,59 @@  gen_gpgpu_setup_scratch(gen_gpgpu *gpu)
 }
 
 static cl_int
+gen_gpgpu_setup_printf_buffer(gen_gpgpu *gpu, cl_kernel_gen kernel_gen, const size_t *global_wk_sz_use)
+{
+  drm_intel_bufmgr *bufmgr = gpu->bufmgr;
+  uint32_t buf_size;
+  cl_uint i;
+
+  if (kernel_gen->printf_num == 0)
+    return CL_SUCCESS;
+
+  /* An guess size. */
+  buf_size = global_wk_sz_use[0] * global_wk_sz_use[1] * global_wk_sz_use[2] *
+             sizeof(int) * 16 * kernel_gen->printf_num;
+  if (buf_size > 16 * 1024 * 1024) //at most.
+    buf_size = 16 * 1024 * 1024;
+  if (buf_size < 1 * 1024 * 1024) // at least.
+    buf_size = 1 * 1024 * 1024;
+
+  gpu->printf.printf_ids = CL_CALLOC(kernel_gen->printf_num, sizeof(cl_uint));
+  gpu->printf.printf_strings = CL_CALLOC(kernel_gen->printf_num, sizeof(char *));
+
+  if (gpu->printf.printf_ids == NULL || gpu->printf.printf_strings == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  for (i = 0; i < kernel_gen->printf_num; i++) {
+    gpu->printf.printf_ids[i] = kernel_gen->printf_ids[i];
+    gpu->printf.printf_strings[i] = CL_MALLOC(strlen(kernel_gen->printf_strings[i]) + 1);
+    if (gpu->printf.printf_strings[i] == NULL)
+      return CL_OUT_OF_HOST_MEMORY;
+
+    memcpy(gpu->printf.printf_strings[i], kernel_gen->printf_strings[i],
+           strlen(kernel_gen->printf_strings[i]) + 1);
+  }
+
+  gpu->printf.printf_buf_size = buf_size;
+  gpu->printf.printf_num = kernel_gen->printf_num;
+  gpu->printf.printf_bo = drm_intel_bo_alloc(bufmgr, "PRINTF_BO", buf_size, 4096);
+  if (gpu->printf.printf_bo == NULL)
+    return CL_OUT_OF_RESOURCES;
+
+  drm_intel_bo_map(gpu->printf.printf_bo, 1);
+  memset(gpu->printf.printf_bo->virtual, 0, buf_size);
+  *(uint32_t *)(gpu->printf.printf_bo->virtual) = 4; // first four is for the length.
+  drm_intel_bo_unmap(gpu->printf.printf_bo);
+
+  if (gpu->mem.max_bti < kernel_gen->printf_bti)
+    gpu->mem.max_bti = kernel_gen->printf_bti;
+
+  gen_gpgpu_setup_bti(gpu, gpu->printf.printf_bo, 0, buf_size,
+                      kernel_gen->printf_bti, I965_SURFACEFORMAT_RAW);
+  return CL_SUCCESS;
+}
+
+static cl_int
 gen_setup_constant_buffer_for_20(cl_kernel kernel, cl_kernel_gen kernel_gen,
                                  cl_program_gen prog_gen, gen_gpgpu *gpu)
 {
@@ -821,6 +882,19 @@  cl_command_queue_delete_gpgpu(void *gpgpu)
     gpu->mem.scratch_bo = NULL;
   }
 
+  if (gpu->printf.printf_bo) {
+    cl_uint i;
+    assert(gpu->printf.printf_num > 0);
+    for (i = 0; i < gpu->printf.printf_num; i++) {
+      CL_FREE(gpu->printf.printf_strings[i]);
+    }
+    CL_FREE(gpu->printf.printf_strings);
+    CL_FREE(gpu->printf.printf_ids);
+
+    drm_intel_bo_unreference(gpu->printf.printf_bo);
+    gpu->printf.printf_bo = NULL;
+  }
+
   if (gpu->mem.stack_bo) {
     drm_intel_bo_unreference(gpu->mem.stack_bo);
     gpu->mem.stack_bo = NULL;
@@ -988,6 +1062,7 @@  cl_command_queue_ND_range_gen_once(cl_command_queue queue, cl_kernel kernel, cl_
     if (ret != CL_SUCCESS)
       break;
 
+    gen_gpgpu_setup_printf_buffer(gpu, kernel_gen, global_wk_sz_use);
     gen_gpgpu_setup_kernel_exec_svm_mem(kernel, kernel_gen, gpu);
 
     /* also setup the device enqueue helper bo if exist */
@@ -1502,7 +1577,7 @@  cl_command_queue_gen_handle_device_enqueue(cl_command_queue queue, cl_kernel ker
         fixed_global_off[i] = ndrange_info->global_work_offset[i];
     }
 
-//    int *slm_sizes = (int *)ptr;
+    //    int *slm_sizes = (int *)ptr;
     int slm_size = block->descriptor->slm_size;
     ptr += slm_size;
 
@@ -1570,6 +1645,14 @@  cl_command_queue_finish_gpgpu(void *gpgpu)
       return CL_INVALID_VALUE;
 
     intel_batchbuffer_finish(gpu->batch);
+
+    if (gpu->printf.printf_num > 0) {
+      drm_intel_bo_map(gpu->printf.printf_bo, 0);
+      cl_gen_output_printf(gpu->printf.printf_bo->virtual, gpu->printf.printf_buf_size,
+                           gpu->printf.printf_ids, gpu->printf.printf_strings,
+                           gpu->printf.printf_num);
+      drm_intel_bo_unmap(gpu->printf.printf_bo);
+    }
   }
 
   return CL_SUCCESS;
diff --git a/runtime/gen/cl_gen.h b/runtime/gen/cl_gen.h
index 85d8f63..2fcfddd 100644
--- a/runtime/gen/cl_gen.h
+++ b/runtime/gen/cl_gen.h
@@ -173,6 +173,10 @@  typedef struct _cl_kernel_gen {
   cl_gen_image_info_offset image_info;
   cl_uint virt_reg_phy_offset_num; // The mapping between virtual reg and phy offset
   cl_gen_virt_phy_offset virt_reg_phy_offset;
+  cl_uint printf_num;
+  cl_int printf_bti;
+  cl_uint *printf_ids;
+  char **printf_strings;
 } _cl_kernel_gen;
 typedef _cl_kernel_gen *cl_kernel_gen;
 
@@ -191,6 +195,7 @@  enum cl_gen_program_note_type {
   GEN_NOTE_TYPE_CL_INFO = 4,
   GEN_NOTE_TYPE_CL_DEVICE_ENQUEUE_INFO = 5,
   GEN_NOTE_TYPE_COMPILER_INFO = 6,
+  GEN_NOTE_TYPE_CL_PRINTF = 7,
 };
 
 typedef struct _cl_program_gen_device_enqueue_info {
@@ -262,6 +267,7 @@  extern int cl_command_queue_finish_gpgpu(void *gpgpu);
 extern void cl_enqueue_nd_range_delete_gen(cl_event event);
 extern cl_int cl_command_queue_create_gen(cl_device_id device, cl_command_queue queue);
 extern void cl_command_queue_delete_gen(cl_device_id device, cl_command_queue queue);
+extern void cl_gen_output_printf(void *buf_addr, uint32_t buf_size, cl_uint *ids, char **fmts, uint32_t printf_num);
 
 /************************************ Compiler ******************************************/
 extern cl_int cl_compiler_load_gen(cl_device_id device);
diff --git a/runtime/gen/cl_kernel_gen.c b/runtime/gen/cl_kernel_gen.c
index 7ff425e..ce8cbf6 100644
--- a/runtime/gen/cl_kernel_gen.c
+++ b/runtime/gen/cl_kernel_gen.c
@@ -107,6 +107,17 @@  cl_kernel_delete_gen(cl_device_id device, cl_kernel kernel)
     kernel_gen->image_info = NULL;
   }
 
+  if (kernel_gen->printf_ids) {
+    assert(kernel_gen->printf_num > 0);
+    CL_FREE(kernel_gen->printf_ids);
+  }
+  kernel_gen->printf_ids = NULL;
+  if (kernel_gen->printf_strings) {
+    assert(kernel_gen->printf_num > 0);
+    CL_FREE(kernel_gen->printf_strings);
+  }
+  kernel_gen->printf_strings = NULL;
+
   CL_FREE(kernel_gen);
 }
 
@@ -153,56 +164,16 @@  cl_kernel_get_info_gen(cl_device_id device, cl_kernel kernel, cl_uint param_name
 }
 
 static cl_int
-cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel)
+cl_program_gen_get_kernel_func_arg_info(cl_kernel kernel, void *desc, cl_uint desc_size,
+                                        cl_program_gen prog_gen, cl_kernel_gen kernel_gen)
 {
-  cl_program prog = kernel->program;
-  cl_program_gen prog_gen;
-  cl_kernel_gen kernel_gen;
-  cl_int offset;
-  void *desc;
   void *ptr;
-  cl_char *name;
-  cl_uint name_size;
-  cl_uint desc_size;
-  cl_uint desc_type;
   cl_uint wg_sz_size;
   cl_uint attr_size;
   cl_uint arg_info_size;
-  int i;
   char *arg_type_qual_str;
   char *arg_access_qualifier_str;
-
-  DEV_PRIVATE_DATA(prog, device, prog_gen);
-  DEV_PRIVATE_DATA(kernel, device, kernel_gen);
-
-  assert(kernel->name);
-
-  if (prog_gen->func_cl_info == NULL)
-    return CL_SUCCESS;
-
-  offset = 0;
-  desc = NULL;
-  while (offset < prog_gen->func_cl_info_data->d_size) {
-    name_size = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset);
-    desc_size = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint));
-    desc_type = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
-    name = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3;
-
-    if (desc_type != GEN_NOTE_TYPE_CL_INFO) {
-      offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
-      continue;
-    }
-
-    if (strcmp((char *)name, (char *)kernel->name) == 0) { // Find the kernel info slot
-      desc = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3 + ALIGN(name_size, 4);
-      break;
-    }
-
-    offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
-  }
-
-  if (desc == NULL)
-    return CL_SUCCESS;
+  int i;
 
   ptr = desc;
   attr_size = *(cl_uint *)ptr;
@@ -316,6 +287,80 @@  cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel)
 }
 
 static cl_int
+cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel)
+{
+  cl_program prog = kernel->program;
+  cl_program_gen prog_gen;
+  cl_kernel_gen kernel_gen;
+  cl_uint name_size;
+  cl_uint desc_size;
+  void *desc;
+  cl_uint desc_type;
+  cl_int offset;
+  cl_char *name;
+  int i;
+  cl_int ret = CL_SUCCESS;
+  cl_bool already_set = CL_FALSE;
+
+  DEV_PRIVATE_DATA(prog, device, prog_gen);
+  DEV_PRIVATE_DATA(kernel, device, kernel_gen);
+
+  assert(kernel->name);
+
+  if (prog_gen->func_cl_info == NULL)
+    return CL_SUCCESS;
+
+  offset = 0;
+  desc = NULL;
+  while (offset < prog_gen->func_cl_info_data->d_size) {
+    name_size = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset);
+    desc_size = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint));
+    desc_type = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint));
+    name = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3;
+    desc = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3 + ALIGN(name_size, 4);
+
+    if (strcmp((char *)name, (char *)kernel->name) != 0) { // Find the kernel info slot
+      offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+      continue;
+    }
+
+    if (desc_type == GEN_NOTE_TYPE_CL_PRINTF) {
+      kernel_gen->printf_bti = *(cl_uint *)desc;
+      desc += sizeof(cl_uint);
+      kernel_gen->printf_num = *(cl_uint *)desc;
+      desc += sizeof(cl_uint);
+
+      kernel_gen->printf_strings = CL_CALLOC(kernel_gen->printf_num, sizeof(char *));
+      kernel_gen->printf_ids = CL_CALLOC(kernel_gen->printf_num, sizeof(cl_uint));
+
+      if (kernel_gen->printf_strings == NULL)
+        return CL_OUT_OF_HOST_MEMORY;
+      if (kernel_gen->printf_ids == NULL)
+        return CL_OUT_OF_HOST_MEMORY;
+
+      for (i = 0; i < kernel_gen->printf_num; i++) {
+        kernel_gen->printf_ids[i] = *(cl_uint *)desc;
+        desc += sizeof(cl_uint);
+        kernel_gen->printf_strings[i] = desc;
+        desc += strlen(desc) + 1;
+      }
+    } else if (desc_type == GEN_NOTE_TYPE_CL_INFO) {
+      if (already_set) {
+        /* Can not contain two CL info for one kernel */
+        return CL_INVALID_KERNEL_DEFINITION;
+      }
+      ret = cl_program_gen_get_kernel_func_arg_info(kernel, desc, desc_size, prog_gen, kernel_gen);
+      if (ret != CL_SUCCESS)
+        return ret;
+    }
+
+    offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4);
+  }
+
+  return CL_SUCCESS;
+}
+
+static cl_int
 cl_program_gen_get_one_kernel_func(cl_device_id device, cl_kernel kernel, GElf_Sym *p_sym_entry)
 {
   cl_program prog = kernel->program;
diff --git a/runtime/gen/cl_printf_gen.c b/runtime/gen/cl_printf_gen.c
new file mode 100644
index 0000000..7e6f182
--- /dev/null
+++ b/runtime/gen/cl_printf_gen.c
@@ -0,0 +1,633 @@ 
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_gen.h"
+#include <stdarg.h>
+
+#define GEN_PRINTF_LOG_MAGIC 0xAABBCCDD
+
+/* Header of one record in the printf buffer.
+ * Note: output_one_printf() does not read the payload through `content`;
+ * it reads the argument bytes starting 3 * sizeof(uint32_t) past the start
+ * of the record, i.e. directly behind the three header words. */
+typedef struct _cl_gen_printf_log {
+  uint32_t magic;         // GEN_PRINTF_LOG_MAGIC (0xAABBCCDD), checked before a record is used.
+  uint32_t size;          // Size of this printf log in bytes, including the header.
+  uint32_t statement_num; // Which printf statement within one kernel; matched against the printf ids.
+  char *content;
+} _cl_gen_printf_log;
+typedef _cl_gen_printf_log *cl_gen_printf_log;
+
+/* Length modifiers of a conversion, as recognised by parse_printf_state(). */
+enum {
+  GEN_PRINTF_LM_NONE, // no length modifier
+  GEN_PRINTF_LM_HH,   // "hh"
+  GEN_PRINTF_LM_H,    // "h"
+  GEN_PRINTF_LM_L,    // "l"
+  GEN_PRINTF_LM_HL,   // "hl"
+};
+
+/* Conversion specifiers. The suffix is the conversion character, with its
+ * case kept where both cases exist (X/x, F/f, E/e, G/g, A/a).
+ * GEN_PRINTF_CONVERSION_S is also used by parser_printf_fmt() to tag the
+ * literal text between two conversions. */
+enum {
+  GEN_PRINTF_CONVERSION_INVALID, // value set before a conversion character has been parsed
+  GEN_PRINTF_CONVERSION_D,
+  GEN_PRINTF_CONVERSION_I,
+  GEN_PRINTF_CONVERSION_O,
+  GEN_PRINTF_CONVERSION_U,
+  GEN_PRINTF_CONVERSION_X,
+  GEN_PRINTF_CONVERSION_x,
+  GEN_PRINTF_CONVERSION_F,
+  GEN_PRINTF_CONVERSION_f,
+  GEN_PRINTF_CONVERSION_E,
+  GEN_PRINTF_CONVERSION_e,
+  GEN_PRINTF_CONVERSION_G,
+  GEN_PRINTF_CONVERSION_g,
+  GEN_PRINTF_CONVERSION_A,
+  GEN_PRINTF_CONVERSION_a,
+  GEN_PRINTF_CONVERSION_C,
+  GEN_PRINTF_CONVERSION_S,
+  GEN_PRINTF_CONVERSION_P
+};
+
+/* One element of a parsed format string: either a piece of literal text or
+ * one conversion. Elements are chained through `next` in format order. */
+typedef struct _gen_printf_state {
+  struct _gen_printf_state *next; // next element, NULL at the end of the list
+  cl_int left_justified;          // '-' flag
+  cl_int sign_symbol; //0 for nothing, 1 for sign, 2 for space.
+  cl_int alter_form;              // '#' flag
+  cl_int zero_padding;            // '0' flag (not set once '-' has been seen)
+  cl_int vector_n;                // vector width from "vN", 0 for a scalar
+  cl_int min_width;               // minimum field width, -1 when absent
+  cl_int precision;               // precision, -1 when absent
+  cl_int length_modifier;         // one of GEN_PRINTF_LM_*
+  cl_int conversion_specifier;    // one of GEN_PRINTF_CONVERSION_*
+  char *str;                      // literal text, or the format handed to printf(); freed by free_printf_state()
+} _gen_printf_state;
+typedef _gen_printf_state *gen_printf_state;
+
+/* Build a scalar printf format ("%[flags][width][.precision][length]conv")
+ * from a parsed state. It is used for vector conversions, whose original
+ * "%vN..." text the host printf does not understand.
+ *
+ * The hl length modifier emits no length characters, and d and i both
+ * produce 'd'.
+ *
+ * Returns a zero-terminated string allocated with CL_CALLOC that the caller
+ * owns, or NULL when the allocation fails. */
+static char *
+generate_printf_fmt(gen_printf_state state)
+{
+  enum { FMT_BUF_SIZE = 256 };
+  char *str = CL_CALLOC(1, FMT_BUF_SIZE);
+  int len = 0;
+
+  if (str == NULL)
+    return NULL;
+
+  str[len++] = '%';
+
+  if (state->left_justified)
+    str[len++] = '-';
+
+  if (state->sign_symbol == 1)
+    str[len++] = '+';
+  else if (state->sign_symbol == 2)
+    str[len++] = ' ';
+
+  if (state->alter_form)
+    str[len++] = '#';
+
+  if (state->zero_padding)
+    str[len++] = '0';
+
+  if (state->min_width >= 0)
+    len += snprintf(str + len, FMT_BUF_SIZE - len, "%d", state->min_width);
+
+  /* Format the precision value itself. Reusing the digits produced for
+     min_width would emit the wrong number, or uninitialised bytes when no
+     width was given. */
+  if (state->precision >= 0)
+    len += snprintf(str + len, FMT_BUF_SIZE - len, ".%d", state->precision);
+
+  switch (state->length_modifier) {
+  case GEN_PRINTF_LM_HH:
+    str[len++] = 'h';
+    str[len++] = 'h';
+    break;
+  case GEN_PRINTF_LM_H:
+    str[len++] = 'h';
+    break;
+  case GEN_PRINTF_LM_L:
+    str[len++] = 'l';
+    break;
+  case GEN_PRINTF_LM_HL:
+    break;
+  default:
+    assert(state->length_modifier == GEN_PRINTF_LM_NONE);
+  }
+
+  /* The buffer was zero-filled, so the conversion character written last is
+     already followed by the terminator. */
+  switch (state->conversion_specifier) {
+  case GEN_PRINTF_CONVERSION_D:
+  case GEN_PRINTF_CONVERSION_I:
+    str[len] = 'd';
+    break;
+  case GEN_PRINTF_CONVERSION_O:
+    str[len] = 'o';
+    break;
+  case GEN_PRINTF_CONVERSION_U:
+    str[len] = 'u';
+    break;
+  case GEN_PRINTF_CONVERSION_X:
+    str[len] = 'X';
+    break;
+  case GEN_PRINTF_CONVERSION_x:
+    str[len] = 'x';
+    break;
+  case GEN_PRINTF_CONVERSION_C:
+    str[len] = 'c';
+    break;
+  case GEN_PRINTF_CONVERSION_F:
+    str[len] = 'F';
+    break;
+  case GEN_PRINTF_CONVERSION_f:
+    str[len] = 'f';
+    break;
+  case GEN_PRINTF_CONVERSION_E:
+    str[len] = 'E';
+    break;
+  case GEN_PRINTF_CONVERSION_e:
+    str[len] = 'e';
+    break;
+  case GEN_PRINTF_CONVERSION_G:
+    str[len] = 'G';
+    break;
+  case GEN_PRINTF_CONVERSION_g:
+    str[len] = 'g';
+    break;
+  case GEN_PRINTF_CONVERSION_A:
+    str[len] = 'A';
+    break;
+  case GEN_PRINTF_CONVERSION_a:
+    str[len] = 'a';
+    break;
+  case GEN_PRINTF_CONVERSION_P:
+    str[len] = 'p';
+    break;
+  default:
+    assert(0);
+    break;
+  }
+
+  return str;
+}
+
+/* Parse one conversion specification.
+ *
+ * begin must point at the '%' that starts the conversion and end at the end
+ * of the format string. The flags, minimum width, precision, vector width,
+ * length modifier and conversion specifier of *state are reset and then
+ * filled in; state->next and state->str are not touched.
+ *
+ * On success *rend is set to the position just behind the conversion
+ * character and that character is returned. -1 is returned when begin does
+ * not point at '%', when the parser would have to step past end, when the
+ * vector width is not one of 2, 3, 4, 8 or 16, or when the conversion
+ * character is unknown; *rend is not written in that case.
+ *
+ * The FMT_PLUS_PLUS and CONVERSION_SPEC_AND_RET macros defined inside the
+ * function are not #undef'ed and contain return statements. */
+static cl_int
+parse_printf_state(char *begin, char *end, char **rend, gen_printf_state state)
+{
+  const char *fmt;
+  state->left_justified = 0;
+  state->sign_symbol = 0; //0 for nothing, 1 for sign, 2 for space.
+  state->alter_form = 0;
+  state->zero_padding = 0;
+  state->vector_n = 0;
+  state->min_width = -1;
+  state->precision = -1;
+  state->length_modifier = GEN_PRINTF_LM_NONE;
+  state->conversion_specifier = GEN_PRINTF_CONVERSION_INVALID;
+
+  fmt = begin;
+
+  if (*fmt != '%')
+    return -1;
+
+#define FMT_PLUS_PLUS                                   \
+  do {                                                  \
+    if (fmt + 1 <= end)                                 \
+      fmt++;                                            \
+    else {                                              \
+      printf("Error, line: %d, fmt > end\n", __LINE__); \
+      return -1;                                        \
+    }                                                   \
+  } while (0)
+
+  FMT_PLUS_PLUS;
+
+  // parse the flags.
+  while (*fmt == '-' || *fmt == '+' || *fmt == ' ' || *fmt == '#' || *fmt == '0')
+    switch (*fmt) {
+    case '-':
+      /* The result of the conversion is left-justified within the field. */
+      state->left_justified = 1;
+      FMT_PLUS_PLUS;
+      break;
+    case '+':
+      /* The result of a signed conversion always begins with a plus or minus sign. */
+      state->sign_symbol = 1;
+      FMT_PLUS_PLUS;
+      break;
+    case ' ':
+      /* If the first character of a signed conversion is not a sign, or if a signed
+         conversion results in no characters, a space is prefixed to the result.
+         If the space and + flags both appear,the space flag is ignored. */
+      if (state->sign_symbol == 0)
+        state->sign_symbol = 2;
+      FMT_PLUS_PLUS;
+      break;
+    case '#':
+      /*The result is converted to an alternative form. */
+      state->alter_form = 1;
+      FMT_PLUS_PLUS;
+      break;
+    case '0':
+      if (!state->left_justified)
+        state->zero_padding = 1;
+      FMT_PLUS_PLUS;
+      break;
+    default:
+      break;
+    }
+
+  // The minimum field width
+  while ((*fmt >= '0') && (*fmt <= '9')) {
+    if (state->min_width < 0)
+      state->min_width = 0;
+    state->min_width = state->min_width * 10 + (*fmt - '0');
+    FMT_PLUS_PLUS;
+  }
+
+  // The precision
+  if (*fmt == '.') {
+    FMT_PLUS_PLUS;
+    state->precision = 0;
+    while (*fmt >= '0' && *fmt <= '9') {
+      state->precision = state->precision * 10 + (*fmt - '0');
+      FMT_PLUS_PLUS;
+    }
+  }
+
+  // handle the vector specifier.
+  if (*fmt == 'v') {
+    FMT_PLUS_PLUS;
+    switch (*fmt) {
+    case '2':
+    case '3':
+    case '4':
+    case '8':
+      state->vector_n = *fmt - '0';
+      FMT_PLUS_PLUS;
+      break;
+    case '1':
+      FMT_PLUS_PLUS;
+      if (*fmt == '6') {
+        state->vector_n = 16;
+        FMT_PLUS_PLUS;
+      } else
+        return -1;
+      break;
+    default:
+      //Wrong vector, error.
+      return -1;
+    }
+  }
+
+  // length modifiers
+  if (*fmt == 'h') {
+    FMT_PLUS_PLUS;
+    if (*fmt == 'h') { //hh
+      state->length_modifier = GEN_PRINTF_LM_HH;
+      FMT_PLUS_PLUS;
+    } else if (*fmt == 'l') { //hl
+      state->length_modifier = GEN_PRINTF_LM_HL;
+      FMT_PLUS_PLUS;
+    } else { //h
+      state->length_modifier = GEN_PRINTF_LM_H;
+    }
+  } else if (*fmt == 'l') {
+    state->length_modifier = GEN_PRINTF_LM_L;
+    FMT_PLUS_PLUS;
+  }
+
+#define CONVERSION_SPEC_AND_RET(XXX, xxx)                      \
+  case XXX:                                                    \
+    state->conversion_specifier = GEN_PRINTF_CONVERSION_##xxx; \
+    FMT_PLUS_PLUS;                                             \
+    *rend = (char *)fmt;                                       \
+    return XXX;                                                \
+    break;
+
+  // conversion specifiers
+  switch (*fmt) {
+    CONVERSION_SPEC_AND_RET('d', D)
+    CONVERSION_SPEC_AND_RET('i', I)
+    CONVERSION_SPEC_AND_RET('o', O)
+    CONVERSION_SPEC_AND_RET('u', U)
+    CONVERSION_SPEC_AND_RET('x', x)
+    CONVERSION_SPEC_AND_RET('X', X)
+    CONVERSION_SPEC_AND_RET('f', f)
+    CONVERSION_SPEC_AND_RET('F', F)
+    CONVERSION_SPEC_AND_RET('e', e)
+    CONVERSION_SPEC_AND_RET('E', E)
+    CONVERSION_SPEC_AND_RET('g', g)
+    CONVERSION_SPEC_AND_RET('G', G)
+    CONVERSION_SPEC_AND_RET('a', a)
+    CONVERSION_SPEC_AND_RET('A', A)
+    CONVERSION_SPEC_AND_RET('c', C)
+    CONVERSION_SPEC_AND_RET('s', S)
+    CONVERSION_SPEC_AND_RET('p', P)
+
+  // %% has been handled
+
+  default:
+    return -1;
+  }
+}
+
+/* Release a whole printf state list: every node and, where set, its string.
+ * A NULL list is accepted. */
+static void
+free_printf_state(gen_printf_state state)
+{
+  gen_printf_state next;
+
+  for (; state != NULL; state = next) {
+    /* Fetch the link before the node is released. */
+    next = state->next;
+
+    if (state->str)
+      CL_FREE(state->str);
+
+    CL_FREE(state);
+  }
+}
+
+/* Split a printf format string into a linked list of states, in format
+ * order. Literal text between conversions becomes a node tagged
+ * GEN_PRINTF_CONVERSION_S that holds a copy of the text; every conversion
+ * becomes a node filled by parse_printf_state(), whose str is the original
+ * conversion text or, for vector conversions, a scalar format built by
+ * generate_printf_fmt().
+ *
+ * Returns the head of the list, to be released with free_printf_state().
+ * Returns NULL when the format is empty, ends with a lone '%', contains an
+ * invalid conversion, or when memory runs out; nothing is leaked then.
+ *
+ * NOTE(review): a "%%" pair is copied into the literal text unchanged, and
+ * output_one_printf() prints literal text through "%s", so it looks like
+ * "%%" is printed as two characters - confirm this is intended. */
+static gen_printf_state
+parser_printf_fmt(char *format)
+{
+  char *begin;
+  char *end;
+  char *p;
+  cl_int ret_char;
+  char *rend;
+  gen_printf_state curr;
+  gen_printf_state first = NULL;
+  gen_printf_state *tail = &first; /* Where the next node gets linked. */
+
+  begin = format;
+  end = format + strlen(format);
+
+  /* Now parse it. */
+  while (*begin) {
+    p = begin;
+
+  again:
+    while (p < end && *p != '%') {
+      p++;
+    }
+    if (p < end && p + 1 == end) { // String with % at end.
+      printf("string end with %%\n");
+      goto error;
+    }
+    if (p + 1 < end && *(p + 1) == '%') { // %%
+      p += 2;
+      goto again;
+    }
+
+    if (p != begin) {
+      curr = CL_CALLOC(1, sizeof(_gen_printf_state));
+      if (curr == NULL)
+        goto error;
+
+      /* Link the node right away so the error path releases it. */
+      *tail = curr;
+      tail = &curr->next;
+
+      curr->conversion_specifier = GEN_PRINTF_CONVERSION_S;
+      curr->str = CL_MALLOC(p - begin + 1);
+      if (curr->str == NULL)
+        goto error;
+      memcpy(curr->str, begin, p - begin);
+      curr->str[p - begin] = 0;
+    }
+
+    if (p == end) // finish
+      break;
+
+    /* Now parse the % start conversion_specifier. */
+    curr = CL_CALLOC(1, sizeof(_gen_printf_state));
+    if (curr == NULL)
+      goto error;
+
+    ret_char = parse_printf_state(p, end, &rend, curr);
+    if (ret_char < 0) {
+      /* The node is not part of the list yet, so release it here. */
+      CL_FREE(curr);
+      goto error;
+    }
+
+    *tail = curr;
+    tail = &curr->next;
+
+    if (curr->vector_n > 0) {
+      curr->str = generate_printf_fmt(curr); // Standard printf can not recognize %v4XXX
+      if (curr->str == NULL)
+        goto error;
+    } else {
+      curr->str = CL_MALLOC(rend - p + 1);
+      if (curr->str == NULL)
+        goto error;
+      memcpy(curr->str, p, rend - p);
+      curr->str[rend - p] = 0;
+    }
+
+    if (rend == end)
+      break;
+
+    begin = rend;
+  }
+
+#if 0
+  {
+    cl_int j = 0;
+    gen_printf_state s = first;
+    while (s) {
+      fprintf(stderr, "---- %d ---- state : \n", j);
+      fprintf(stderr, "             conversion_specifier : %d\n", s->conversion_specifier);
+      fprintf(stderr, "             vector_n : %d\n", s->vector_n);
+      fprintf(stderr, "             left_justified : %d\n", s->left_justified);
+      fprintf(stderr, "             sign_symbol: %d\n", s->sign_symbol);
+      fprintf(stderr, "             alter_form : %d\n", s->alter_form);
+      fprintf(stderr, "             zero_padding : %d\n", s->zero_padding);
+      fprintf(stderr, "             min_width : %d\n", s->min_width);
+      fprintf(stderr, "             precision : %d\n", s->precision);
+      fprintf(stderr, "             length_modifier : %d\n", s->length_modifier);
+      fprintf(stderr, "             string :  %s      strlen is %ld\n", s->str, strlen(s->str));
+      j++;
+      s = s->next;
+    }
+  }
+#endif
+
+  return first;
+
+error:
+  printf("error format string.\n");
+  free_printf_state(first);
+  return NULL;
+}
+
+/* Print one log record. The parsed format list is walked in order: literal
+ * text nodes are echoed as they are, every other node is passed to printf()
+ * with its argument read from the record payload. The payload is consumed
+ * sequentially; vector conversions read vector_n values and separate them
+ * with ','. */
+static void
+output_one_printf(gen_printf_state all_state, cl_gen_printf_log log)
+{
+#define PRINT_SOMETHING(target_ty)                      \
+  do {                                                  \
+    printf(s->str, *(target_ty *)(data + data_offset)); \
+    data_offset += sizeof(target_ty);                   \
+  } while (0)
+
+  /* The payload starts right behind the three uint32_t header words. */
+  char *data = (char *)(log) + 3 * sizeof(uint32_t);
+  size_t data_offset = 0;
+  gen_printf_state s;
+  cl_int lane, lanes;
+
+  for (s = all_state; s; s = s->next) {
+    if (s->conversion_specifier == GEN_PRINTF_CONVERSION_S) {
+      printf("%s", s->str);
+      continue;
+    }
+
+    lanes = s->vector_n > 0 ? s->vector_n : 1;
+    for (lane = 0; lane < lanes; lane++) {
+      if (lane != 0)
+        printf(",");
+
+      switch (s->conversion_specifier) {
+      /* Integer conversions: 64 bit with the l modifier, int otherwise. */
+      case GEN_PRINTF_CONVERSION_D:
+      case GEN_PRINTF_CONVERSION_I:
+      case GEN_PRINTF_CONVERSION_O:
+      case GEN_PRINTF_CONVERSION_U:
+      case GEN_PRINTF_CONVERSION_X:
+      case GEN_PRINTF_CONVERSION_x:
+        if (s->length_modifier == GEN_PRINTF_LM_L)
+          PRINT_SOMETHING(uint64_t);
+        else
+          PRINT_SOMETHING(int);
+        break;
+
+      case GEN_PRINTF_CONVERSION_C:
+        PRINT_SOMETHING(char);
+        break;
+
+      /* All floating point conversions read one float. */
+      case GEN_PRINTF_CONVERSION_F:
+      case GEN_PRINTF_CONVERSION_f:
+      case GEN_PRINTF_CONVERSION_E:
+      case GEN_PRINTF_CONVERSION_e:
+      case GEN_PRINTF_CONVERSION_G:
+      case GEN_PRINTF_CONVERSION_g:
+      case GEN_PRINTF_CONVERSION_A:
+      case GEN_PRINTF_CONVERSION_a:
+        PRINT_SOMETHING(float);
+        break;
+
+      case GEN_PRINTF_CONVERSION_P:
+        PRINT_SOMETHING(int);
+        break;
+
+      default:
+        assert(0);
+        return;
+      }
+    }
+  }
+}
+
+/* Print every record of a finished printf buffer.
+ *
+ * buf_addr   mapped printf buffer; its first uint32_t holds the number of
+ *            bytes used, counting that word itself
+ * buf_size   size of the buffer; the used length is clamped to it
+ * ids, fmts  printf statement ids and their format strings, printf_num
+ *            entries each
+ *
+ * Output stops with an error log at the first record that is truncated, has
+ * a wrong magic, has an implausible size or names an unknown statement. A
+ * record whose format string cannot be parsed is logged and skipped. */
+LOCAL void
+cl_gen_output_printf(void *buf_addr, uint32_t buf_size, cl_uint *ids,
+                     char **fmts, uint32_t printf_num)
+{
+  /* magic, size and statement_num; the payload follows directly. */
+  const uint32_t header_sz = 3 * sizeof(uint32_t);
+  uint32_t parsed;
+  uint32_t total_sz = ((uint32_t *)buf_addr)[0];
+  char *p = (char *)buf_addr + sizeof(uint32_t);
+  uint32_t i;
+  uint32_t log_sz;
+  gen_printf_state all_states;
+
+  if (total_sz > buf_size)
+    total_sz = buf_size;
+
+  for (parsed = 4; parsed < total_sz; parsed += log_sz, p += log_sz) {
+    cl_gen_printf_log log = (cl_gen_printf_log)(p);
+
+    if (total_sz - parsed < header_sz) {
+      CL_LOG_ERROR("Printf log output is truncated");
+      return;
+    }
+    if (log->magic != GEN_PRINTF_LOG_MAGIC) {
+      CL_LOG_ERROR("Printf log output has wrong magic");
+      return;
+    }
+
+    /* A size below the header (0 in particular) would never advance the
+       loop, a size beyond the used length would read past the data. */
+    log_sz = log->size;
+    if (log_sz < header_sz || log_sz > total_sz - parsed) {
+      CL_LOG_ERROR("Printf log output has wrong size %u", log_sz);
+      return;
+    }
+
+    for (i = 0; i < printf_num; i++) {
+      if (ids[i] == log->statement_num)
+        break;
+    }
+    if (i == printf_num) {
+      CL_LOG_ERROR("Printf log output, can not find the printf statement for %d",
+                   log->statement_num);
+      return;
+    }
+
+    all_states = parser_printf_fmt(fmts[i]);
+    if (all_states == NULL) {
+      CL_LOG_ERROR("Printf statement %d with wrong format %s",
+                   log->statement_num, fmts[i]);
+      /* The loop header still steps over this record. */
+      continue;
+    }
+
+    output_one_printf(all_states, log);
+    free_printf_state(all_states);
+  }
+}